tree = read.tree("/Users/kosukesano/bio/240903_ASTRAL.tre")
p=ggtree(tree)+
xlim(0, 7)+
theme(text = element_text(face = "italic"))+
geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
#geom_text(aes(label=node), hjust=-.2)+
theme_tree()
p備忘録
バイオインフォマ関係の覚書
基本的な事項
ssh gw.ddbj.nig.ac.jp
#スパコンへのログイン。この後にqloginを実行して作業ノードへ移る
scp ~/Desctop/hoge kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/bio/
#ローカルの~/Desctop/にあるhogeというファイルを遺伝研スパコンの/home/kosukesano/bio/にコピーする。ローカルで実行する。ディレクトリをコピーする場合はscp -rとする。
scp kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/bio/hoge ~/Desctop/
#遺伝研スパコンの/home/kosukesano/bio/にあるhogeというファイルをローカルの~/Desctop/にコピーする。**ローカルで実行する。**
source ~/tools/pyenv_env/braker_profile
# braker環境を立ち上げる際、初めに行う。pyenvとcondaにパスを通し、conda環境に入った後にbraker環境に入る。
source ~/tools/pyenv_env/EDTA_profile
# EDTA環境を立ち上げる際、初めに行う。pyenvとconda、mambaにパスを通し、mambaforge環境に入った後にEDTA環境に入る。
source ~/pyenv_conda_environment/.pyenv_profile
# pyenvを実行する際、初めに行う。パスを通す。
source ~/tools/pyenv_env/ETE_profile
# ETEを使う際、初めに行う。ETE用の環境に入る。
ssh scorpion
#牧野研スコーピオンサーバーへのログイン。2024年4月
0430
遺伝研スパコンのホームディレクトリの中身を全てHDDに移した。場所は /Volumes/Elements/240430_ddbj_backup/kosukesano$
# 実行したコード
$ scp -r kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/ /Volumes/Elements/240430_ddbj_backup2024年5月
0501
遺伝研スパコンのAnaconda3とminiforgeを削除した。またホームディレクトリにあったファイルも(ディレクトリ以外は)削除した。
# Anacondaのアンインストール
$ conda install anaconda-clean
$ anaconda-clean
$ rm -fr /anaconda3# miniforgeのアンインストール
$ rm -rf ~/.conda
# ~/.local/bin/ になぜかmambaがいたので、
$ rm mamba遺伝研の環境初期化ページに倣い、.bashrcと.bash_profileを書き直した。遺伝研初期化ページ参考
変更前の.bash_profile
# .bash_profile
# Get the aliases and functions
#
if [ -f ~/.bashrc ]; then
. ~/.bashrc
fi
##############################
source ~/.bashrc
#############################
# User specific environment and startup programs
PATH=$PATH:$HOME/.local/bin:$HOME/bin
export PATH
export GENEMARK_PATH=/home/kosukesano/local/gmes_linux_64_4
export PROTHINT_PATH=/home/kosukesano/local/gmes_linux_64_4/ProtHint/bin
export ALIGNMENT_TOOL_PATH=/home/kosukesano/local/spaln-master
export CDBTOOLS_PATH=/home/kosukesano/local/cdbfasta-master
#source ~/.bash_profile変更後の.bash_profile
# .bash_profile
# Get the aliases and functions
if [ -f ~/.bashrc ]; then
. ~/.bashrc
fi
# User specific environment and startup programs
PATH=$PATH:$HOME/.local/bin:$HOME/bin
export PATH変更前の.bashrc
# .bashrc
# Source global definitions
if [ -f /etc/bashrc ]; then
. /etc/bashrc
fi
##########################################
# If this variable is already set, skip the rest of the script
if [ -n "$BASHRC_LOADED" ]; then
return
fi
# Set the variable to indicate that the script has been loaded
BASHRC_LOADED=1
# ---
# ~/.bashrc: executed by bash(1) for non-login shells.
# see /usr/share/doc/bash/examples/startup-files (in the package bash-doc)
# for examples
# If not running interactively, don't do anything
case $- in
*i*) ;;
*) return;;
esac
# don't put duplicate lines or lines starting with space in the history.
# See bash(1) for more options
HISTCONTROL=ignoreboth
# append to the history file, don't overwrite it
shopt -s histappend
# for setting history length see HISTSIZE and HISTFILESIZE in bash(1)
HISTSIZE=1000
HISTFILESIZE=2000
# check the window size after each command and, if necessary,
# update the values of LINES and COLUMNS.
shopt -s checkwinsize
# If set, the pattern "**" used in a pathname expansion context will
# match all files and zero or more directories and subdirectories.
#shopt -s globstar
# make less more friendly for non-text input files, see lesspipe(1)
[ -x /usr/bin/lesspipe ] && eval "$(SHELL=/bin/sh lesspipe)"
# set variable identifying the chroot you work in (used in the prompt below)
if [ -z "${debian_chroot:-}" ] && [ -r /etc/debian_chroot ]; then
debian_chroot=$(cat /etc/debian_chroot)
fi
# set a fancy prompt (non-color, unless we know we "want" color)
case "$TERM" in
xterm-color|*-256color) color_prompt=yes;;
esac
# uncomment for a colored prompt, if the terminal has the capability; turned
# off by default to not distract the user: the focus in a terminal window
# should be on the output of commands, not on the prompt
#force_color_prompt=yes
if [ -n "$force_color_prompt" ]; then
if [ -x /usr/bin/tput ] && tput setaf 1 >&/dev/null; then
# We have color support; assume it's compliant with Ecma-48
# (ISO/IEC-6429). (Lack of such support is extremely rare, and such
# a case would tend to support setf rather than setaf.)
color_prompt=yes
else
color_prompt=
fi
fi
if [ "$color_prompt" = yes ]; then
PS1='${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '
else
PS1='${debian_chroot:+($debian_chroot)}\u@\h:\w\$ '
fi
unset color_prompt force_color_prompt
# If this is an xterm set the title to user@host:dir
case "$TERM" in
xterm*|rxvt*)
PS1="\[\e]0;${debian_chroot:+($debian_chroot)}\u@\h: \w\a\]$PS1"
;;
*)
;;
esac
# enable color support of ls and also add handy aliases
if [ -x /usr/bin/dircolors ]; then
test -r ~/.dircolors && eval "$(dircolors -b ~/.dircolors)" || eval "$(dircolors -b)"
alias ls='ls --color=auto'
#alias dir='dir --color=auto'
#alias vdir='vdir --color=auto'
alias grep='grep --color=auto'
alias fgrep='fgrep --color=auto'
alias egrep='egrep --color=auto'
fi
# colored GCC warnings and errors
#export GCC_COLORS='error=01;31:warning=01;35:note=01;36:caret=01;32:locus=01:quote=01'
# some more ls aliases
alias ll='ls -alF'
alias la='ls -A'
alias l='ls -CF'
# Add an "alert" alias for long running commands. Use like so:
# sleep 10; alert
alias alert='notify-send --urgency=low -i "$([ $? = 0 ] && echo terminal || echo error)" "$(history|tail -n1|sed -e '\''s/^\s*[0-9]\+\s*//;s/[;&|]\s*alert$//'\'')"'
# Alias definitions.
# You may want to put all your additions into a separate file like
# ~/.bash_aliases, instead of adding them here directly.
# See /usr/share/doc/bash-doc/examples in the bash-doc package.
if [ -f ~/.bash_aliases ]; then
. ~/.bash_aliases
fi
# enable programmable completion features (you don't need to enable
# this, if it's already enabled in /etc/bash.bashrc and /etc/profile
# sources /etc/bash.bashrc).
if ! shopt -oq posix; then
if [ -f /usr/share/bash-completion/bash_completion ]; then
. /usr/share/bash-completion/bash_completion
elif [ -f /etc/bash_completion ]; then
. /etc/bash_completion
fi
fi
#########################################
# Uncomment the following line if you don't like systemctl's auto-paging feature:
# export SYSTEMD_PAGER=
# User specific aliases and functions
#module load gcc
# <<< conda initialize <<<
# enable color support of ls and also add handy aliases
if [ -x /usr/bin/dircolors ]; then
test -r ~/.dircolors && eval "$(dircolors -b ~/.dircolors)" || eval "$(dircolors -b)"
alias ls='ls --color=auto'
#alias dir='dir --color=auto'
#alias vdir='vdir --color=auto'
alias grep='grep --color=auto'
alias fgrep='fgrep --color=auto'
alias egrep='egrep --color=auto'
fi変更後の.bashrc
# .bashrc
# Source global definitions
if [ -f /etc/bashrc ]; then
. /etc/bashrc
fi
# Uncomment the following line if you don't like systemctl's auto-paging feature:
# export SYSTEMD_PAGER=
# User specific aliases and functions
module load gcc元の.bashrcにあった記述のうち、色に関わる内容をもう一度記載。
.bashrcへの加筆内容
###ここから下は主に書き加えた部分###
####################################################################################
# set a fancy prompt (non-color, unless we know we "want" color)
case "$TERM" in
xterm-color|*-256color) color_prompt=yes;;
esac
####################################################################################
#↑により、カラー対応の端末で Bash を実行している場合にのみ、色付きのプロンプトが表示される
###################################################################################
# uncomment for a colored prompt, if the terminal has the capability; turned
# off by default to not distract the user: the focus in a terminal window
# should be on the output of commands, not on the prompt
#force_color_prompt=yes
if [ -n "$force_color_prompt" ]; then
if [ -x /usr/bin/tput ] && tput setaf 1 >&/dev/null; then
# We have color support; assume it's compliant with Ecma-48
# (ISO/IEC-6429). (Lack of such support is extremely rare, and such
# a case would tend to support setf rather than setaf.)
color_prompt=yes
else
color_prompt=
fi
fi
if [ "$color_prompt" = yes ]; then
PS1='${debian_chroot:+($debian_chroot)}\[\033[01;32m\]\u@\h\[\033[00m\]:\[\033[01;34m\]\w\[\033[00m\]\$ '
else
PS1='${debian_chroot:+($debian_chroot)}\u@\h:\w\$ '
fi
unset color_prompt force_color_prompt
# If this is an xterm set the title to user@host:dir
case "$TERM" in
xterm*|rxvt*)
PS1="\[\e]0;${debian_chroot:+($debian_chroot)}\u@\h: \w\a\]$PS1"
;;
*)
;;
esac
####################################################################################
#↑ターミナルプロンプトの色や表示を設定する
#####################################################################################
# enable color support of ls and also add handy aliases
if [ -x /usr/bin/dircolors ]; then
test -r ~/.dircolors && eval "$(dircolors -b ~/.dircolors)" || eval "$(dircolors -b)"
alias ls='ls --color=auto'
#alias dir='dir --color=auto'
#alias vdir='vdir --color=auto'
alias grep='grep --color=auto'
alias fgrep='fgrep --color=auto'
alias egrep='egrep --color=auto'
fi
#######################################################################################
#↑ls および grep コマンドに色のサポートをつけるまた、ログイン時にGCCに関わる部分で以下のエラーが発生した。
[kosukesano@gwB1 ~]$ qlogin
Your job 25915671 ("QLOGIN") has been submitted
waiting for interactive job to be scheduled ...
Your interactive job 25915671 has been successfully scheduled.
Establishing /home/geadmin/AGER/utilbin/lx-amd64/qlogin_wrapper session to host at137 ...
Warning: Permanently added '[at137]:41669,[172.19.7.185]:41669' (ECDSA) to the list of known hosts.
Welcome to Ubuntu 22.04.3 LTS (GNU/Linux 5.15.0-87-generic x86_64)
* Documentation: https://help.ubuntu.com
* Management: https://landscape.canonical.com
* Support: https://ubuntu.com/advantage
System information as of Wed May 1 16:06:51 JST 2024
System load: 3.0078125 Users logged in: 34
Usage of /: 32.8% of 823.03GB IPv4 address for eno1: 172.19.18.185
Memory usage: 34% IPv4 address for eno2: 192.168.50.187
Swap usage: 27% IPv4 address for ibp161s0: 172.19.7.185
Processes: 2133
=> There is 1 zombie process.
* Strictly confined Kubernetes makes edge and IoT secure. Learn how MicroK8s
just raised the bar for easy, resilient and secure K8s cluster deployment.
https://ubuntu.com/engage/secure-kubernetes-at-the-edge
Expanded Security Maintenance for Applications is not enabled.
11 updates can be applied immediately.
To see these additional updates run: apt list --upgradable
56 additional security updates can be applied with ESM Apps.
Learn more about enabling ESM Apps service at https://ubuntu.com/esm
Last login: Tue Apr 30 10:14:26 2024 from 172.19.7.250
ERROR: Unable to locate a modulefile for 'gcc'かつての.bashrcを見るとmodule load gccという記述がコメントアウトされていた。暫定的に今回もコメントアウトしておく。
# User specific aliases and functions
#module load gcc
#↑この部分でなんかエラーが出たんだけど、昔のbashrcはコメントアウトしちゃってたので同様の処置をとった。0502
pyenvのインストール
kosukesano@at139:~$ git clone git://github.com/yyuu/pyenv.git ~/.pyenv
# pyenvをgitでインストール.bash_profileなどにパスを書くと何かミスがあった場合重大なことになるため、別のプロファイルを作ってそこにパスを書く
kosukesano@at139:~$ mkdir pyenv_conda_environment
# pyenv_conda_environmentというディレクトリをホーム直下に作成
kosukesano@at139:~$ cd pyenv_conda_environment/
kosukesano@at139:~/pyenv_conda_environment$ nano .pyenv_profile
# .pyenv_profileというファイルを作成.pyenv_profileの中身
export PYENV_ROOT="$HOME/.pyenv"
export PATH="$PYENV_ROOT/bin:$PATH"
eval "$(pyenv init -)"この後、pyenvを打ってもCommand not foundと出てしまうが、source ~/pyenv_conda_environment/.pyenv_profileで先ほどのプロファイルをソースすると、pyenvが機能するようになる。
今後もpyenvを使う場合は毎回初めにsource ~/pyenv_conda_environment/.pyenv_profileを行う。
kosukesano@at137:~$ pyenv
Command 'pyenv' not found, did you mean:
command 'p7env' from deb libnss3-tools (2:3.68.2-0ubuntu1.2)
Try: apt install <deb name>
kosukesano@at137:~$ source ~/pyenv_conda_environment/.pyenv_profile
kosukesano@at137:~$ pyenv
pyenv 2.4.0-3-g3ff54e89
Usage: pyenv <command> [<args>]
Some useful pyenv commands are:
--version Display the version of pyenv
.
.
.
.
.pyenvにてanaconda3環境の構築
kosukesano@at137:~$ pyenv install anaconda3-2023.09-0↑を作業ノード@137で実行したらうまくいかなかった。conda.exeが作業ノードに高負荷を与えていると遺伝研の方から言われた。
原因 condaが重い
メモリをめちゃくちゃ増やしたらなんとかなった
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 1
#$ -l s_vmem=48G
#$ -l mem_req=48G
date
echo starting at date
source ~/pyenv_conda_environment/.pyenv_profile
pyenv install anaconda3-2020.11
date/home/kosukesano/.pyenv/versionにanaconda3-2020.11を作成した。
~/tools/pyenv_envを作成、その下にbraker_profileを作成した。
source ~/.bash_profile
source ~/pyenv_conda_environment/.pyenv_profile
pyenv global anaconda3-2020.11
# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/home/kosukesano/.pyenv/versions/anaconda3-2020.11/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
eval "$__conda_setup"
else
if [ -f "/home/kosukesano/.pyenv/versions/anaconda3-2020.11/etc/profile.d/conda.sh" ]; then
. "/home/kosukesano/.pyenv/versions/anaconda3-2020.11/etc/profile.d/conda.sh"
else
export PATH="/home/kosukesano/.pyenv/versions/anaconda3-2020.11/bin:$PATH"
fi
fi
unset __conda_setup
# <<< conda initialize <<<
conda activate brakerbrakerのインストールの前準備
うまくいったやつ
conda install -c anaconda perl
conda install -c anaconda biopythonうまくいかなかったやつ
conda install -c bioconda perl-app-cpanminus
conda install -c bioconda perl-file-spec
conda install -c bioconda perl-hash-merge
conda install -c bioconda perl-list-util
conda install -c bioconda perl-module-load-conditional
conda install -c bioconda perl-posix
conda install -c bioconda perl-file-homedir
conda install -c bioconda perl-parallel-forkmanager
conda install -c bioconda perl-scalar-util-numeric
conda install -c bioconda perl-yaml
conda install -c bioconda perl-class-data-inheritable
conda install -c bioconda perl-exception-class(braker) kosukesano@at137:~/tools/braker$ conda install -c bioconda perl-list-util
Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.
ResolvePackageNotFound:
- python=3.1
(braker) kosukesano@at137:~/tools/braker$ conda install -c bioconda perl-module-load-conditional
Collecting package metadata (current_repodata.json): done
Solving environment: failed with initial frozen solve. Retrying with flexible solve.
Solving environment: failed with repodata from current_repodata.json, will retry with next repodata source.
ResolvePackageNotFound:
- python=3.1こんな感じのエラーが出てインストールできなかった
0507
現在のbraker環境を削除
(braker) kosukesano@at137:~$ conda deactivate
(base) kosukesano@at137:~$ conda remove -n braker --all
Remove all packages in environment /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker:
## Package Plan ##
environment location: /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker
The following packages will be REMOVED:
_libgcc_mutex-0.1-main
_openmp_mutex-5.1-1_gnu
biopython-1.78-py312h5eee18b_0
blas-1.0-mkl
bzip2-1.0.8-h7b6447c_0
ca-certificates-2023.08.22-h06a4308_0
expat-2.5.0-h6a678d5_0
gdbm-1.18-hd4cb3f1_4
intel-openmp-2023.1.0-hdb19cb5_46306
ld_impl_linux-64-2.38-h1181459_1
libffi-3.4.4-h6a678d5_0
libgcc-ng-11.2.0-h1234567_1
libgomp-11.2.0-h1234567_1
libstdcxx-ng-11.2.0-h1234567_1
libuuid-1.41.5-h5eee18b_0
mkl-2023.1.0-h213fc3f_46344
mkl-service-2.4.0-py312h5eee18b_1
ncurses-6.4-h6a678d5_0
numpy-1.26.0-py312hc5e2394_0
numpy-base-1.26.0-py312h0da6c21_0
openssl-3.0.12-h7f8727e_0
perl-5.34.0-h5eee18b_2
pip-23.3-py312h06a4308_0
python-3.12.0-h996f2a0_0
readline-8.2-h5eee18b_0
setuptools-68.0.0-py312h06a4308_0
sqlite-3.41.2-h5eee18b_0
tbb-2021.8.0-hdb19cb5_0
tk-8.6.12-h1ccaba5_0
tzdata-2023c-h04d1e81_0
wheel-0.41.2-py312h06a4308_0
xz-5.4.2-h5eee18b_0
zlib-1.2.13-h5eee18b_0
Proceed ([y]/n)? y
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
(base) kosukesano@at137改めてbraker環境を構築、python=3.9に指定
(base) kosukesano@at137:~/tools$ conda create -n braker python=3.9
Collecting package metadata (current_repodata.json): done
Solving environment: done
==> WARNING: A newer version of conda exists. <==
current version: 4.9.2
latest version: 24.4.0
Please update conda by running
$ conda update -n base -c defaults conda
## Package Plan ##
environment location: /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker
added / updated specs:
- python=3.9
The following packages will be downloaded:
package | build
---------------------------|-----------------
_libgcc_mutex-0.1 | main 3 KB
_openmp_mutex-5.1 | 1_gnu 21 KB
ca-certificates-2024.3.11 | h06a4308_0 127 KB
ld_impl_linux-64-2.38 | h1181459_1 654 KB
libffi-3.4.4 | h6a678d5_1 141 KB
libgcc-ng-11.2.0 | h1234567_1 5.3 MB
libgomp-11.2.0 | h1234567_1 474 KB
libstdcxx-ng-11.2.0 | h1234567_1 4.7 MB
ncurses-6.4 | h6a678d5_0 914 KB
openssl-3.0.13 | h7f8727e_1 5.2 MB
pip-23.3.1 | py39h06a4308_0 2.6 MB
python-3.9.19 | h955ad1f_1 25.1 MB
readline-8.2 | h5eee18b_0 357 KB
setuptools-69.5.1 | py39h06a4308_0 1003 KB
sqlite-3.45.3 | h5eee18b_0 1.2 MB
tk-8.6.14 | h39e8969_0 3.4 MB
tzdata-2024a | h04d1e81_0 116 KB
wheel-0.43.0 | py39h06a4308_0 109 KB
xz-5.4.6 | h5eee18b_1 643 KB
zlib-1.2.13 | h5eee18b_1 111 KB
------------------------------------------------------------
Total: 52.2 MB
The following NEW packages will be INSTALLED:
_libgcc_mutex pkgs/main/linux-64::_libgcc_mutex-0.1-main
_openmp_mutex pkgs/main/linux-64::_openmp_mutex-5.1-1_gnu
ca-certificates pkgs/main/linux-64::ca-certificates-2024.3.11-h06a4308_0
ld_impl_linux-64 pkgs/main/linux-64::ld_impl_linux-64-2.38-h1181459_1
libffi pkgs/main/linux-64::libffi-3.4.4-h6a678d5_1
libgcc-ng pkgs/main/linux-64::libgcc-ng-11.2.0-h1234567_1
libgomp pkgs/main/linux-64::libgomp-11.2.0-h1234567_1
libstdcxx-ng pkgs/main/linux-64::libstdcxx-ng-11.2.0-h1234567_1
ncurses pkgs/main/linux-64::ncurses-6.4-h6a678d5_0
openssl pkgs/main/linux-64::openssl-3.0.13-h7f8727e_1
pip pkgs/main/linux-64::pip-23.3.1-py39h06a4308_0
python pkgs/main/linux-64::python-3.9.19-h955ad1f_1
readline pkgs/main/linux-64::readline-8.2-h5eee18b_0
setuptools pkgs/main/linux-64::setuptools-69.5.1-py39h06a4308_0
sqlite pkgs/main/linux-64::sqlite-3.45.3-h5eee18b_0
tk pkgs/main/linux-64::tk-8.6.14-h39e8969_0
tzdata pkgs/main/noarch::tzdata-2024a-h04d1e81_0
wheel pkgs/main/linux-64::wheel-0.43.0-py39h06a4308_0
xz pkgs/main/linux-64::xz-5.4.6-h5eee18b_1
zlib pkgs/main/linux-64::zlib-1.2.13-h5eee18b_1
Proceed ([y]/n)? y
Downloading and Extracting Packages
tk-8.6.14 | 3.4 MB | #################################################################################################################################################### | 100%
ca-certificates-2024 | 127 KB | #################################################################################################################################################### | 100%
libffi-3.4.4 | 141 KB | #################################################################################################################################################### | 100%
_openmp_mutex-5.1 | 21 KB | #################################################################################################################################################### | 100%
xz-5.4.6 | 643 KB | #################################################################################################################################################### | 100%
ld_impl_linux-64-2.3 | 654 KB | #################################################################################################################################################### | 100%
sqlite-3.45.3 | 1.2 MB | #################################################################################################################################################### | 100%
python-3.9.19 | 25.1 MB | #################################################################################################################################################### | 100%
openssl-3.0.13 | 5.2 MB | #################################################################################################################################################### | 100%
pip-23.3.1 | 2.6 MB | #################################################################################################################################################### | 100%
libgcc-ng-11.2.0 | 5.3 MB | #################################################################################################################################################### | 100%
setuptools-69.5.1 | 1003 KB | #################################################################################################################################################### | 100%
zlib-1.2.13 | 111 KB | #################################################################################################################################################### | 100%
wheel-0.43.0 | 109 KB | #################################################################################################################################################### | 100%
libgomp-11.2.0 | 474 KB | #################################################################################################################################################### | 100%
tzdata-2024a | 116 KB | #################################################################################################################################################### | 100%
readline-8.2 | 357 KB | #################################################################################################################################################### | 100%
_libgcc_mutex-0.1 | 3 KB | #################################################################################################################################################### | 100%
libstdcxx-ng-11.2.0 | 4.7 MB | #################################################################################################################################################### | 100%
ncurses-6.4 | 914 KB | #################################################################################################################################################### | 100%
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
#
# To activate this environment, use
#
# $ conda activate braker
#
# To deactivate an active environment, use
#
# $ conda deactivate
(base) kosukesano@at137:~/tools$ source ~/tools/pyenv_env/braker_profile
(braker) kosukesano@at137:~/tools$環境はこうなった
(braker) kosukesano@at137:~/tools$ conda info
active environment : braker
active env location : /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker
shell level : 2
user config file : /home/kosukesano/.condarc
populated config files :
conda version : 4.9.2
conda-build version : 3.20.5
python version : 3.8.5.final.0
virtual packages : __glibc=2.35=0
__unix=0=0
__archspec=1=x86_64
base environment : /home/kosukesano/.pyenv/versions/anaconda3-2020.11 (writable)
channel URLs : https://repo.anaconda.com/pkgs/main/linux-64
https://repo.anaconda.com/pkgs/main/noarch
https://repo.anaconda.com/pkgs/r/linux-64
https://repo.anaconda.com/pkgs/r/noarch
package cache : /home/kosukesano/.pyenv/versions/anaconda3-2020.11/pkgs
/home/kosukesano/.conda/pkgs
envs directories : /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs
/home/kosukesano/.conda/envs
platform : linux-64
user-agent : conda/4.9.2 requests/2.24.0 CPython/3.8.5 Linux/5.15.0-87-generic ubuntu/22.04.3 glibc/2.35
UID:GID : 6811:10086
netrc file : None
offline mode : False
(braker) kosukesano@at137:~/tools$昔はこうだったんだけど、なんか変わったんか?
(braker) kosukesano@at137:~$ conda info
active environment : braker
active env location : /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker
shell level : 2
user config file : /home/kosukesano/.condarc
populated config files :
conda version : 4.9.2
conda-build version : 3.20.5
python version : 3.8.5.final.0
virtual packages : __glibc=2.35=0
__unix=0=0
__archspec=1=x86_64
base environment : /home/kosukesano/.pyenv/versions/anaconda3-2020.11 (writable)
channel URLs : https://repo.anaconda.com/pkgs/main/linux-64
https://repo.anaconda.com/pkgs/main/noarch
https://repo.anaconda.com/pkgs/r/linux-64
https://repo.anaconda.com/pkgs/r/noarch
package cache : /home/kosukesano/.pyenv/versions/anaconda3-2020.11/pkgs
/home/kosukesano/.conda/pkgs
envs directories : /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs
/home/kosukesano/.conda/envs
platform : linux-64
user-agent : conda/4.9.2 requests/2.24.0 CPython/3.8.5 Linux/5.15.0-87-generic ubuntu/22.04.3 glibc/2.35
UID:GID : 6811:10086
netrc file : None
offline mode : False
(braker) kosukesano@at137:~$改めてbrakerインストールの前準備を行う
conda install -c anaconda perl
conda install -c anaconda biopython
conda install -c bioconda perl-app-cpanminus
conda install -c bioconda perl-file-spec
conda install -c bioconda perl-hash-merge
conda install -c bioconda perl-list-util
conda install -c bioconda perl-module-load-conditional
conda install -c bioconda perl-posix
conda install -c bioconda perl-file-homedir
conda install -c bioconda perl-parallel-forkmanager
conda install -c bioconda perl-scalar-util-numeric
conda install -c bioconda perl-yaml
conda install -c bioconda perl-class-data-inheritable
conda install -c bioconda perl-exception-class
conda install -c bioconda perl-test-pod
# なんか変な出力だったけど、多分うまくいってる
conda install -c bioconda perl-file-which # skip if you are not comparing to reference annotation
(braker) kosukesano@at137:~/tools$ conda install -c bioconda perl-file-which # skip if you are not comparing to reference annotation
Collecting package metadata (current_repodata.json): done
Solving environment: done
==> WARNING: A newer version of conda exists. <==
current version: 4.9.2
latest version: 24.4.0
Please update conda by running
$ conda update -n base -c defaults conda
# All requested packages already installed.
(braker) kosukesano@at137:~/tools$
conda install -c bioconda perl-mce
conda install -c bioconda perl-threaded
conda install -c bioconda perl-list-util
(braker) kosukesano@at137:~/tools$ conda install -c bioconda perl-list-util
Collecting package metadata (current_repodata.json): done
Solving environment: done
==> WARNING: A newer version of conda exists. <==
current version: 4.9.2
latest version: 24.4.0
Please update conda by running
$ conda update -n base -c defaults conda
# All requested packages already installed.
(braker) kosukesano@at137:~/tools$
conda install -c bioconda perl-math-utils
conda install -c bioconda cdbtools
conda install -c eumetsat perl-yaml-xs
conda install -c bioconda perl-data-dumperperlモジュールのインストール
#いけたやつ
cpanm Hash::Merge
cpanm List::Util
cpanm MCE::Mutex
cpanm Module::Load::Conditional
cpanm Parallel::Forkcpanm
cpanm Scalar::Util::Numeric
cpanm YAML
cpanm Math::Utils
cpanm File::HomeDir
cpanm Thread::Queue
#いけなかったやつ
cpanm File::Spec::Functions
cpanm YAML::XS
cpanm Data::Dumper
cpanm threads
#skip?
(braker) kosukesano@at137:~/tools$ cpanm POSIX
skipping R/RJ/RJBS/perl-5.38.0.tar.gzいけなかったやつについて、x86_64-conda_cos6-linux-gnu-gccがないことが原因らしい。それを入れてみる。
conda install anaconda::gcc_linux-64
# インストールできた改めてperlのモジュールをインストールしてみる
(braker) kosukesano@at137:~/tools$ cpanm File::Spec::Functions
--> Working on File::Spec::Functions
Fetching http://www.cpan.org/authors/id/X/XS/XSAWYERX/PathTools-3.75.tar.gz ... OK
Configuring PathTools-3.75 ... OK
Building and testing PathTools-3.75 ... FAIL
! Installing File::Spec::Functions failed. See /home/kosukesano/.cpanm/work/1715072449.2386863/build.log for details. Retry with --force to force install it.
(braker) kosukesano@at137:~/tools$0510
BRAKER本体のインストール
~/tool直下にbraker_git_installというディレクトリを作成し、そこでgit cloneを実行。
(braker) kosukesano@at139:~/tools$ mkdir braker_git_install
(braker) kosukesano@at139:~/tools$ cd braker_git_install/
(braker) kosukesano@at139:~/tools/braker_git_install$ git clone https://github.com/Gaius-Augustus/BRAKER.git
Cloning into 'BRAKER'...
remote: Enumerating objects: 7324, done.
remote: Counting objects: 100% (1666/1666), done.
remote: Compressing objects: 100% (660/660), done.
remote: Total 7324 (delta 1072), reused 1530 (delta 983), pack-reused 5658
Receiving objects: 100% (7324/7324), 123.32 MiB | 20.53 MiB/s, done.
Resolving deltas: 100% (5423/5423), done.
Updating files: 100% (152/152), done.
(braker) kosukesano@at139:~/tools/braker_git_install$ ls
BRAKER
(braker) kosukesano@at139:~/tools/braker_git_install$BRAKERの内部で動くソフトのインストール
それぞれ~/tool直下にディレクトリを作成し、git cloneでインストールした。
git clone https://github.com/gatech-genemark/ProtHint.git
git clone https://github.com/Gaius-Augustus/TSEBRA.git
git clone https://github.com/gatech-genemark/GeneMark-ETP.gitプロテインデータベースのダウンロード
ここからAthropodaのファイルをローカルでダウンロード。遺伝研に移動しgunzipで解凍した。
# ローカルで実行
scp ~/Downloads/Arthropoda.fa.gz kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools
# 遺伝研で実行
gunzip Arthropoda.fa.gz BRAKERの内部で動くソフトへのパス開通
BRAKER本体や、GeneMark-ETPなどにパスを通す。 なお、GeneMark-ETPについては、GeneMark-ETP/binだけでなく、その下のGeneMark-ETP/bin/gmesやGeneMark-ETP/bin/gmstを個別に用いることがあり、それぞれ別にパスを通す。
パスは全てbraker.profileに追記し、braker.profileをsourceすることでパスも一緒に通るようにした。
# braker.profileの追記内容
#################################
export PATH="~/tools/braker_git_install/BRAKER/scripts:$PATH"
export PATH="~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin:$PATH"
export PATH="~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes:$PATH"
export PATH="~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmst:$PATH"
export PATH="~/tools/ProtHint_git_install/ProtHint/bin:$PATH"
export PATH="~/tools/TSEBRA_git_install/TSEBRA/bin:$PATH"マダラのゲノムとタンパク質データベースを用いた BRAKERのテストラン
/home/kosukesano/tools/にfor_brakertestというディレクトリを作成。その中でbrakertest.shを作成。
# brakertest.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 2
#$ -l s_vmem=24G
#$ -l mem_req=24G
echo start at
date
source ~/tools/pyenv_env/braker_profile
braker.pl --genome=~/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=~/tools/Arthropoda.fa --threads=2
dateジョブを投げて結果を待つ。
0514
brakertest.shの結果
# brakertest.sh.e26009591の一部
#*********
# WARNING: /lustre7/home/kosukesano/../config is not a directory. Will not set $AUGUSTUS_CONFIG_PATH to /lustre7/home/kosukesano/../config!
#*********
# Fri May 10 16:11:59 2024: Checking /usr/share/augustus/config as potential path for $AUGUSTUS_CONFIG_PATH.
#*********
# WARNING: /usr/share/augustus/config is not a directory. Will not set $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config!
#*********
# Fri May 10 16:11:59 2024: ERROR: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1834
$AUGUSTUS_CONFIG_PATH not set!AUGUSTUS_CONFIG_PATHが通っていないというエラー。
解決策
# .bash_profileに以下を追加
export AUGUSTUS_CONFIG_PATH="/usr/share/augustus/config/"その後もパスが通っていないことに関するエラーが多発。
### GENEMARK_PATHが通らないエラー ############################################
# Tue May 14 15:26:11 2024: Found environment variable $GENEMARK_PATH.
# Tue May 14 15:26:11 2024: Checking ~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes as potential path for $GENEMARK_PATH.
#*********
# WARNING: ~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes is not a directory. Will not set $GENEMARK_PATH to ~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes!
#*********
# Tue May 14 15:26:11 2024: Checking ~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes/gmes/ as potential path for $GENEMARK_PATH.
#*********
# WARNING: ~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes/gmes/ is not a directory. Will not set $GENEMARK_PATH to ~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes/gmes/!
#*********
# Tue May 14 15:26:11 2024: ERROR: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1834
$GENEMARK_PATH not set!
There are 3 alternative ways to set GENEMARK_PATH for
braker.pl:
a) provide command-line argument --GENEMARK_PATH=/your/path
b) use an existing environment variable $GENEMARK_PATH
for setting the environment variable, run
export GENEMARK_PATH=/your/path
in your shell. You may append this to your .bashrc or
.profile file in order to make the variable available to
all your bash sessions.
c) braker.pl can try guessing the location of
GENEMARK_PATH from the location of gmes_petap.pl
executable if it is available in your $PATH variable.
If you try to rely on this option, you can check by
typing
which gmes_petap.pl
in your shell, whether the executable is in your $PATH
Tue May 14 15:26:11 JST 2024
### PROTHINT_PATHが通らないエラー ############################################
# Tue May 14 15:27:58 2024: Trying to set $PROTHINT_PATH...
# Tue May 14 15:27:58 2024: Found environment variable $PROTHINT_PATH.
# Tue May 14 15:27:58 2024: Checking ~/tools/ProtHint_git_install/ProtHint/bin/ as potential path for $PROTHINT_PATH.
#*********
# WARNING: ~/tools/ProtHint_git_install/ProtHint/bin/ is not a directory. Will not set $PROTHINT_PATH to ~/tools/ProtHint_git_install/ProtHint/bin/!
#*********
# Tue May 14 15:27:58 2024: ERROR: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1834
$PROTHINT_PATH not set!
There are 3 alternative ways to set PROTHINT_PATH for
braker.pl:
a) provide command-line argument --PROTHINT_PATH=/your/path
b) use an existing environment variable $PROTHINT_PATH
for setting the environment variable, run
export PROTHINT_PATH=/your/path
in your shell. You may append this to your .bashrc or
.profile file in order to make the variable available to
all your bash sessions.
c) braker.pl can try guessing the location of
PROTHINT_PATH from the location of prothint.py
executable if it is available in your $PATH variable.
If you try to rely on this option, you can check by
typing
which prothint.py
in your shell, whether the executable is in your $PATH
Tue May 14 15:27:58 JST 2024結論として以下の内容をbraker_profileに追記した。
# ~/tools/pyenv_envにあるbraker_profileへの追記内容。
#################################
export AUGUSTUS_CONFIG_PATH="/usr/share/augustus/config/"
export GENEMARK_PATH=~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes
export PROTHINT_PATH=~/tools/ProtHint_git_install/ProtHint/bin
export TSEBRA_PATH=~/tools/TSEBRA_git_install/TSEBRA/binAUGUSTUS_CONFIG_PATH="/usr/share/augustus/config/"について- パスの最後に
/をつける。 "の有無が与える影響は不明。これだとうまくいってる。
- パスの最後に
GENEMARK_PATH=~/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmesについて- パスの最後には
/をつけてはいけない。 "をつけてはいけない。GENEMARK_PATHだが実際にはGeneMark-ETP/bin/gmes下にあるgmes_petap.plを参照しているため、/gmesまでパスを通す。
- パスの最後には
PROTHINT_PATH=~/tools/ProtHint_git_install/ProtHint/binについて- パスの最後には
/をつけてはいけない。 "をつけてはいけない。
- パスの最後には
TSEBRA_PATH=~/tools/TSEBRA_git_install/TSEBRA/binについて- パス最後の
/の有無が与える影響は不明。これだとうまくいってる。 "の有無が与える影響は不明。これだとうまくいってる。
このような処理を行なった後、brakertest.shを(時短のため)ジョブではなく自分の作業ノードで実行。
kosukesano@at137:~/tools/for_brakertest$ bash brakertest.sh
start at
Tue May 14 15:51:40 JST 2024
#**********************************************************************************
# BRAKER CONFIGURATION
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=~/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=~/tools/Arthropoda.fa --threads=2
# Tue May 14 15:51:43 2024: braker.pl version 3.0.8
# Tue May 14 15:51:43 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Tue May 14 15:51:43 2024: Configuring of BRAKER for using external tools...
# Tue May 14 15:51:43 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Tue May 14 15:51:43 2024: Found environment variable $AUGUSTUS_CONFIG_PATH.
# Tue May 14 15:51:43 2024: Checking /usr/share/augustus/config/ as potential path for $AUGUSTUS_CONFIG_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config/!
# Tue May 14 15:51:43 2024: WARNING: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1933
AUGUSTUS_CONFIG_PATH/species (in this case /usr/share/augustus/config//species) is not writeable. BRAKER will try to copy the AUGUSTUS config directory to a writeable location.
#**********************************************************************************
# BRAKER CONFIGURATION
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=~/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=~/tools/Arthropoda.fa --threads=2
# Tue May 14 15:51:43 2024: braker.pl version 3.0.8
# Tue May 14 15:51:43 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Tue May 14 15:51:43 2024: Configuring of BRAKER for using external tools...
# Tue May 14 15:51:43 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Tue May 14 15:51:43 2024: Found environment variable $AUGUSTUS_CONFIG_PATH.
# Tue May 14 15:51:43 2024: Checking /usr/share/augustus/config/ as potential path for $AUGUSTUS_CONFIG_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config/!
# Tue May 14 15:51:43 2024: WARNING: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1933
AUGUSTUS_CONFIG_PATH/species (in this case /usr/share/augustus/config//species) is not writeable. BRAKER will try to copy the AUGUSTUS config directory to a writeable location.
# Tue May 14 15:51:43 2024: Trying to set $AUGUSTUS_BIN_PATH...
# Tue May 14 15:51:43 2024: Did not find environment variable $AUGUSTUS_BIN_PATH.
# Tue May 14 15:51:43 2024: Trying to guess AUGUSTUS_BIN_PATH from location of augustus executable that is available in your $PATH
# Tue May 14 15:51:43 2024: Checking /usr/bin as potential path for $AUGUSTUS_BIN_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $AUGUSTUS_BIN_PATH to /usr/bin!
# Tue May 14 15:51:43 2024: Trying to set $AUGUSTUS_SCRIPTS_PATH...
# Tue May 14 15:51:43 2024: Did not find environment variable $AUGUSTUS_SCRIPTS_PATH.
# Tue May 14 15:51:43 2024: Checking /usr/share/augustus/config//../scripts as potential path for $AUGUSTUS_SCRIPTS_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $AUGUSTUS_SCRIPTS_PATH to /usr/share/augustus/config//../scripts!
# Tue May 14 15:51:43 2024: WARNING: BRAKER will copy the
AUGUSTUS_CONFIG folder into your home directory!
# Tue May 14 15:51:43 2024: WARNING: $AUGUSTUS_CONFIG_PATH/species (in this case /usr/share/augustus/config//species ) is not writeable.
*** IMPORTANT: Resetting $AUGUSTUS_CONFIG_PATH=/home/kosukesano/.augustus because BRAKER requires a writable location!
# Tue May 14 15:51:43 2024: Trying to set $PYTHON3_PATH...
# Tue May 14 15:51:43 2024: Did not find environment variable $PYTHON3_PATH.
# Tue May 14 15:51:43 2024: Trying to guess PYTHON3_PATH from location of python3 executable that is available in your $PATH
# Tue May 14 15:51:43 2024: Checking /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker/bin as potential path for $PYTHON3_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $PYTHON3_PATH to /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker/bin!
# Tue May 14 15:51:43 2024: Trying to set $GENEMARK_PATH...
# Tue May 14 15:51:43 2024: Found environment variable $GENEMARK_PATH.
# Tue May 14 15:51:43 2024: Checking /home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes as potential path for $GENEMARK_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $GENEMARK_PATH to /home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes!
# Tue May 14 15:51:43 2024: Trying to set $DIAMOND_PATH...
# Tue May 14 15:51:43 2024: Did not find environment variable $DIAMOND_PATH.
# Tue May 14 15:51:43 2024: Trying to guess DIAMOND_PATH from location of diamond executable that is available in your $PATH
# Tue May 14 15:51:43 2024: Checking /usr/bin as potential path for $DIAMOND_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $DIAMOND_PATH to /usr/bin!
# Tue May 14 15:51:43 2024: Trying to set $PROTHINT_PATH...
# Tue May 14 15:51:43 2024: Found environment variable $PROTHINT_PATH.
# Tue May 14 15:51:43 2024: Checking /home/kosukesano/tools/ProtHint_git_install/ProtHint/bin as potential path for $PROTHINT_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $PROTHINT_PATH to /home/kosukesano/tools/ProtHint_git_install/ProtHint/bin!
# Tue May 14 15:51:43 2024: Trying to set $TSEBRA_PATH...
# Tue May 14 15:51:43 2024: Found environment variable $TSEBRA_PATH.
# Tue May 14 15:51:43 2024: Checking /home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin as potential path for $TSEBRA_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $TSEBRA_PATH to /home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin!
# Tue May 14 15:51:43 2024: Trying to set $CDBTOOLS_PATH...
# Tue May 14 15:51:43 2024: Did not find environment variable $CDBTOOLS_PATH.
# Tue May 14 15:51:43 2024: Trying to guess CDBTOOLS_PATH from location of cdbfasta executable that is available in your $PATH
# Tue May 14 15:51:43 2024: Checking /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker/bin as potential path for $CDBTOOLS_PATH.
# Tue May 14 15:51:43 2024: Success! Setting $CDBTOOLS_PATH to /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker/bin!
# Tue May 14 15:51:45 2024: BRAKER will execute GeneMark-EP for training GeneMark and generating a training gene set for AUGUSTUS, using protein information as sole extrinsic evidence source.
#*********
# IMPORTANT INFORMATION: no species for identifying the AUGUSTUS parameter set that will arise from this BRAKER run was set. BRAKER will create an AUGUSTUS parameter set with name Sp_1. This parameter set can be used for future BRAKER/AUGUSTUS prediction runs for the same species. It is usually not necessary to retrain AUGUSTUS with novel extrinsic data if a high quality parameter set already exists.
#*********
# Tue May 14 15:51:45 2024: ERROR: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 3633
protein sequence file /lustre7/home/kosukesano/tools/for_brakertest/~/tools/Arthropoda.fa does not exist.
Tue May 14 15:51:45 JST 2024
kosukesano@at137:~/tools/for_brakertest$ protein sequence file does not exist. ???
0515
カレントディレクトリでbash brakertest.shを実行するとBRAKERが走る?
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 2
#$ -l s_vmem=1G
#$ -l mem_req=1G
echo start at
date
source ~/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2
dateArthropoda.fa does not exist.というエラーは入力ファイルを絶対指定していない事が原因らしい。絶対指定したところエラーがなくなった。
実行後にはbrakerというディレクトリが~/tools/for_brakertest下にできる。しかしこれが邪魔するのか、もう一度実行しようとすると以下のエラーが生じる。
kosukesano@at137:~/tools/for_brakertest$ bash brakertest.sh
start at
Wed May 15 15:33:32 JST 2024
#**********************************************************************************
# BRAKER CONFIGURATION
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2
# Wed May 15 15:33:33 2024: braker.pl version 3.0.8
# Wed May 15 15:33:33 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Wed May 15 15:33:33 2024: Configuring of BRAKER for using external tools...
# Wed May 15 15:33:33 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Wed May 15 15:33:33 2024: Found environment variable $AUGUSTUS_CONFIG_PATH.
# Wed May 15 15:33:33 2024: Checking /usr/share/augustus/config/ as potential path for $AUGUSTUS_CONFIG_PATH.
# Wed May 15 15:33:33 2024: Success! Setting $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config/!
# Wed May 15 15:33:33 2024: WARNING: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1933
AUGUSTUS_CONFIG_PATH/species (in this case /usr/share/augustus/config//species) is not writeable. BRAKER will try to copy the AUGUSTUS config directory to a writeable location.
# Wed May 15 15:33:33 2024: Log information is stored in file /lustre7/home/kosukesano/tools/for_brakertest/braker/braker.log
ERROR in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1240
Failed to create direcotry /lustre7/home/kosukesano/tools/for_brakertest/braker/GeneMark-ES!
Wed May 15 15:33:33 JST 2024
kosukesano@at137:~/tools/for_brakertest$同じファイルをジョブとして投げるとAUGUSTUSのパスが通らない
Use of uninitialized value in subroutine entry at /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl line 1920.
#**********************************************************************************
# BRAKER CONFIGURATION
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2 --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin --TSEBRA_PATH=home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin --
# Wed May 15 15:26:19 2024: braker.pl version 3.0.8
# Wed May 15 15:26:19 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Wed May 15 15:26:19 2024: Configuring of BRAKER for using external tools...
# Wed May 15 15:26:19 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Wed May 15 15:26:19 2024: Found environment variable $AUGUSTUS_CONFIG_PATH.
# Wed May 15 15:26:19 2024: Checking /usr/share/augustus/config/ as potential path for $AUGUSTUS_CONFIG_PATH.
#*********
# WARNING: /usr/share/augustus/config/ is not a directory. Will not set $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config/!
#*********
# Wed May 15 15:26:19 2024: Checking /lustre7/home/kosukesano/tools/../config as potential path for $AUGUSTUS_CONFIG_PATH.
#*********
# WARNING: /lustre7/home/kosukesano/tools/../config is not a directory. Will not set $AUGUSTUS_CONFIG_PATH to /lustre7/home/kosukesano/tools/../config!
#*********
# Wed May 15 15:26:19 2024: Checking /usr/share/augustus/config as potential path for $AUGUSTUS_CONFIG_PATH.
#*********
# WARNING: /usr/share/augustus/config is not a directory. Will not set $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config!
#*********
# Wed May 15 15:26:19 2024: ERROR: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1834
$AUGUSTUS_CONFIG_PATH not set!
There are 3 alternative ways to set this variable for braker.pl:
a) provide command-line argument --AUGUSTUS_CONFIG_PATH=/your/path
b) use an existing environment variable $AUGUSTUS_CONFIG_PATH
for setting the environment variable, run
export AUGUSTUS_CONFIG_PATH=/your/path
in your shell. You may append this to your .bashrc or
.profile file in order to make the variable available to all
your bash sessions.
c) braker.pl can try guessing the location of
$AUGUSTUS_CONFIG_PATH from an augustus executable that is
available in your $PATH variable.
If you try to rely on this option, you can check by typing
which augustus
in your shell, whether there is an augustus executable in
your $PATH
Be aware: the $AUGUSTUS_CONFIG_PATH must be writable for
braker.pl because braker.pl is a pipeline that
optimizes parameters that reside in that
directory. This might be problematic in case you
are using a system-wide installed augustus
installation that resides in a directory that is
not writable to you as a user.前に見たエラーと同じ……。
パスが認識されていない?
0520
pyenv下でmambaforge環境を作成
遺伝研の作業ノードで実行した。
kosukesano@at138:~/pyenv_conda_environment$ pyenv install mambaforge-22.9.0-3
Downloading Mambaforge-22.9.0-3-Linux-x86_64.sh.sh...
-> https://github.com/conda-forge/miniforge/releases/download/22.9.0-3/Mambaforge-22.9.0-3-Linux-x86_64.sh
Installing Mambaforge-22.9.0-3-Linux-x86_64.sh...
Collecting package metadata (current_repodata.json): done
Solving environment: done
==> WARNING: A newer version of conda exists. <==
current version: 22.9.0
latest version: 24.5.0
Please update conda by running
$ conda update -n base -c conda-forge conda
## Package Plan ##
environment location: /lustre7/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3
added / updated specs:
- conda=22.9.0
- pip
The following packages will be downloaded:
package | build
---------------------------|-----------------
ca-certificates-2024.2.2 | hbcca054_0 152 KB conda-forge
certifi-2024.2.2 | pyhd8ed1ab_0 157 KB conda-forge
openssl-3.3.0 | hd590300_0 2.8 MB conda-forge
pip-24.0 | pyhd8ed1ab_0 1.3 MB conda-forge
------------------------------------------------------------
Total: 4.4 MB
The following packages will be UPDATED:
ca-certificates 2022.12.7-ha878542_0 --> 2024.2.2-hbcca054_0 None
certifi 2022.12.7-pyhd8ed1ab_0 --> 2024.2.2-pyhd8ed1ab_0 None
openssl 3.0.7-h0b41bf4_1 --> 3.3.0-hd590300_0 None
pip 22.3.1-pyhd8ed1ab_0 --> 24.0-pyhd8ed1ab_0 None
Downloading and Extracting Packages
pip-24.0 | 1.3 MB | #################################################################################################################################################### | 100%
openssl-3.3.0 | 2.8 MB | #################################################################################################################################################### | 100%
certifi-2024.2.2 | 157 KB | #################################################################################################################################################### | 100%
ca-certificates-2024 | 152 KB | #################################################################################################################################################### | 100%
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Retrieving notices: ...working... done
Installed Mambaforge-22.9.0-3-Linux-x86_64.sh to /home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3
kosukesano@at138:~/pyenv_conda_environment$EDTA環境の構築
~/tools/pyenv_env下にEDTA_profileを作成
### EDTA_profileの中身
source ~/.bash_profile
source ~/pyenv_conda_environment/.pyenv_profile
pyenv global mambaforge-22.9.0-3
# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
eval "$__conda_setup"
else
if [ -f "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3//etc/profile.d/conda.sh" ]; then
. "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/conda.sh"
else
export PATH="/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/bin:$PATH"
fi
fi
unset __conda_setup
if [ -f "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh" ]; then
. "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh"
fi
# <<< conda initialize <<<
conda activate EDTA2(これをsourceすればEDTAが動く)
上記シェルスクリプトをsourceしてmambaforge環境を立ち上げたのち、以下のコマンドでEDTAをインストール、EDTA2という環境を構築した
git clone https://github.com/oushujun/EDTA.git
cd EDTA
mamba env create -f EDTA_2.2.x.yml
conda activate EDTA2最新のマダラゲノムを用いたsoftmaskの復習
### 最新のマダラゲノムを遺伝研に転送
scp /Volumes/Elements_1/240514_new_weebil_genome/231117_madaragenome_fasta kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano
### nama_dataというディレクトリにそれを格納、ついでに拡張子を.fastaにしておく
(EDTA2) kosukesano@at138:~$ mv 231117_madaragenome_fasta ~/tools/for_softmask/nama_data
(EDTA2) kosukesano@at138:~$ cd ~/tools/for_softmask/nama_data
(EDTA2) kosukesano@at138:~/tools/for_softmask/nama_data$ ls
231117_madaragenome_fasta
(EDTA2) kosukesano@at138:~/tools/for_softmask/nama_data$ less 231117_madaragenome_fasta
(EDTA2) kosukesano@at138:~/tools/for_softmask/nama_data$ mv 231117_madaragenome_fasta ~/tools/for_softmask/nama_data/231117_madaragenome.fasta
(EDTA2) kosukesano@at138:~/tools/for_softmask/nama_data$
### 下準備、BLAST_DATABASE_PREFIXという名前で、参照データベースを作成(作業ノードで実行)
(EDTA2) kosukesano@at138:~/tools/for_softmask$ BuildDatabase -name BLAST_DATABASE_PREFIX /home/kosukesano/tools/for_softmask/nama_data/231117_madaragenome.fasta
Building database BLAST_DATABASE_PREFIX:
Reading /home/kosukesano/tools/for_softmask/nama_data/231117_madaragenome.fasta...
Number of sequences (bp) added to database: 209 ( 1295393365 bp )
(EDTA2) kosukesano@at138:~/tools/for_softmask$RepeatModelerの実行
### RepeatModeler_test.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 2
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
RepeatModeler -database BLAST_DATABASE_PREFIX -pa 12
date0521
RepeatModeler_test.shの結果
### RepeatModeler_test.sh.e26118406 (エラーメッセージ)
ERROR from search engine (0)
### RM_78034.MonMay201654522024(出力ファイル)
(EDTA2) kosukesano@at138:~/tools/for_softmask$ ls RM_78034.MonMay201654522024/
consensi.fa families.stk round-1 round-2
(EDTA2) kosukesano@at138:~/tools/for_softmask$メモリが足りなかった?とりあえずメモリを48にしてもう一度qsub_beta
/usr/share/augustus/configを自身のホームディレクトリに再帰的にコピー
cp -r /usr/share/config ~/tools/AUGUSTUS_CONFIG_copy
(braker) kosukesano@at138:~/tools/AUGUSTUS_CONFIG_copy/config$ ls
cgp extrinsic model profile species
(braker) kosukesano@at138:~/tools/AUGUSTUS_CONFIG_copy/config$ ここをパスに指定してqsub
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 2
#$ -l s_vmem=1G
#$ -l mem_req=1G
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2\
--AUGUSTUS_CONFIG_PATH=/home/kosukesano/tools/AUGUSTUS_CONFIG_copy/config\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
date結果
Use of uninitialized value in subroutine entry at /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl line 1920.
#**********************************************************************************
# BRAKER CONFIGURATION
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropod
a.fa --threads=2 --AUGUSTUS_CONFIG_PATH=/home/kosukesano/tools/AUGUSTUS_CONFIG_copy/config --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes --PROTHINT_PATH
=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
# Tue May 21 13:04:42 2024: braker.pl version 3.0.8
# Tue May 21 13:04:42 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Tue May 21 13:04:42 2024: Configuring of BRAKER for using external tools...
# Tue May 21 13:04:42 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Tue May 21 13:04:42 2024: Found command line argument $AUGUSTUS_CONFIG_PATH.
# Tue May 21 13:04:42 2024: Checking /home/kosukesano/tools/AUGUSTUS_CONFIG_copy/config as potential path for $AUGUSTUS_CONFIG_PATH.
# Tue May 21 13:04:42 2024: Success! Setting $AUGUSTUS_CONFIG_PATH to /home/kosukesano/tools/AUGUSTUS_CONFIG_copy/config!
# Tue May 21 13:04:42 2024: Trying to set $AUGUSTUS_BIN_PATH...
# Tue May 21 13:04:42 2024: Found environment variable $AUGUSTUS_BIN_PATH.
# Tue May 21 13:04:42 2024: Checking /usr/bin as potential path for $AUGUSTUS_BIN_PATH.
#*********
# WARNING: Couldn't find augustus in /usr/bin. Will not set $AUGUSTUS_BIN_PATH to /usr/bin!
#*********
# Tue May 21 13:04:42 2024: Checking /home/kosukesano/tools/AUGUSTUS_CONFIG_copy/config/../bin as potential path for $AUGUSTUS_BIN_PATH.
#*********
# WARNING: /home/kosukesano/tools/AUGUSTUS_CONFIG_copy/config/../bin is not a directory. Will not set $AUGUSTUS_BIN_PATH to /home/kosukesano/tools/AUGUSTUS_CONFIG_copy/config/../bin!
#*********
# Tue May 21 13:04:42 2024: Checking /usr/share/augustus/bin as potential path for $AUGUSTUS_BIN_PATH.
#*********
# WARNING: /usr/share/augustus/bin is not a directory. Will not set $AUGUSTUS_BIN_PATH to /usr/share/augustus/bin!
#*********
# Tue May 21 13:04:42 2024: ERROR: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1834
$AUGUSTUS_BIN_PATH not set!
There are 3 alternative ways to set this variable for
braker.pl:
a) provide command-line argument
--AUGUSTUS_BIN_PATH=/your/path
b) use an existing environment variable $AUGUSTUS_BIN_PATH
for setting the environment variable, run
export AUGUSTUS_BIN_PATH=/your/path
in your shell. You may append this to your .bashrc or
.profile file in order to make the variable available to
all your bash sessions.
c) braker.pl can try guessing the location of
$AUGUSTUS_BIN_PATH from the location of
$AUGUSTUS_CONFIG_PATH (in this case
/home/kosukesano/tools/AUGUSTUS_CONFIG_copy/config/../binaugustus関係のファイルを全部自分のディレクトリにコピー
~/tools/All_AUGUSTUS_testというディレクトリを作成し、その下に/usr/share/augustusを全てコピーした。また、その下に/binディレクトリを作成し、そこに`/usr/bin/augustus/をコピーした。
また、DIAMONDSというツールが要求されたので、~/tools/DIAMOND_git_installを作成し、その下にgitでインストールした。
(braker) kosukesano@at138:~/tools$ mkdir DIAMOND_git_install
(braker) kosukesano@at138:~/tools$ cd DIAMOND_git_install/
(braker) kosukesano@at138:~/tools/DIAMOND_git_install$ wget http://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
tar xzf diamond-linux64.tar.gz
--2024-05-21 15:30:41-- http://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
Resolving github.com (github.com)... 20.27.177.113
Connecting to github.com (github.com)|20.27.177.113|:80... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz [following]
--2024-05-21 15:30:41-- https://github.com/bbuchfink/diamond/releases/download/v0.9.24/diamond-linux64.tar.gz
Connecting to github.com (github.com)|20.27.177.113|:443... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://objects.githubusercontent.com/github-production-release-asset-2e65be/31987083/313cc780-09dd-11e9-902e-599c1618e37d?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20240521%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240521T063041Z&X-Amz-Expires=300&X-Amz-Signature=66499e2d5de74c872454dd8ac0770632c87059513935b1208862ed19b28f4121&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=31987083&response-content-disposition=attachment%3B%20filename%3Ddiamond-linux64.tar.gz&response-content-type=application%2Foctet-stream [following]
--2024-05-21 15:30:41-- https://objects.githubusercontent.com/github-production-release-asset-2e65be/31987083/313cc780-09dd-11e9-902e-599c1618e37d?X-Amz-Algorithm=AWS4-HMAC-SHA256&X-Amz-Credential=releaseassetproduction%2F20240521%2Fus-east-1%2Fs3%2Faws4_request&X-Amz-Date=20240521T063041Z&X-Amz-Expires=300&X-Amz-Signature=66499e2d5de74c872454dd8ac0770632c87059513935b1208862ed19b28f4121&X-Amz-SignedHeaders=host&actor_id=0&key_id=0&repo_id=31987083&response-content-disposition=attachment%3B%20filename%3Ddiamond-linux64.tar.gz&response-content-type=application%2Foctet-stream
Resolving objects.githubusercontent.com (objects.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to objects.githubusercontent.com (objects.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 2418573 (2.3M) [application/octet-stream]
Saving to: ‘diamond-linux64.tar.gz’
diamond-linux64.tar.gz 100%[======================================================================================================>] 2.31M --.-KB/s in 0.05s
2024-05-21 15:30:43 (50.0 MB/s) - ‘diamond-linux64.tar.gz’ saved [2418573/2418573]
(braker) kosukesano@at138:~/tools/DIAMOND_git_install$ ls
diamond diamond-linux64.tar.gz diamond_manual.pdf
(braker) kosukesano@at138:~/tools/DIAMOND_git_install$brakertest.shのスクリプトとしてパスを通した
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 2
#$ -l s_vmem=1G
#$ -l mem_req=1G
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2\
--AUGUSTUS_CONFIG_PATH=/home/kosukesano/tools/All_AUGUSTUS_test/augustus/config\
--AUGUSTUS_BIN_PATH=/home/kosukesano/tools/All_AUGUSTUS_test/bin\
--AUGUSTUS_SCRIPTS_PATH=/home/kosukesano/tools/All_AUGUSTUS_test/augustus/scripts\
--DIAMOND_PATH=/home/kosukesano/tools/DIAMOND_git_install\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
date結果
Use of uninitialized value in subroutine entry at /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl line 1920.
#**********************************************************************************
# BRAKER CONFIGURATION
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2 --AUGUSTUS_CONFIG_PATH=/home/kosukesano/tools/All_AUGUSTUS_test/augustus/config --AUGUSTUS_BIN_PATH=/home/kosukesano/tools/All_AUGUSTUS_test/bin --AUGUSTUS_SCRIPTS_PATH=/home/kosukesano/tools/All_AUGUSTUS_test/augustus/scripts --DIAMOND_PATH=/home/kosukesano/tools/DIAMOND_git_install --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
# Tue May 21 15:36:14 2024: braker.pl version 3.0.8
# Tue May 21 15:36:14 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Tue May 21 15:36:14 2024: Configuring of BRAKER for using external tools...
# Tue May 21 15:36:14 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Tue May 21 15:36:14 2024: Found command line argument $AUGUSTUS_CONFIG_PATH.
# Tue May 21 15:36:14 2024: Checking /home/kosukesano/tools/All_AUGUSTUS_test/augustus/config as potential path for $AUGUSTUS_CONFIG_PATH.
# Tue May 21 15:36:14 2024: Success! Setting $AUGUSTUS_CONFIG_PATH to /home/kosukesano/tools/All_AUGUSTUS_test/augustus/config!
# Tue May 21 15:36:14 2024: Trying to set $AUGUSTUS_BIN_PATH...
# Tue May 21 15:36:14 2024: Found command line argument $AUGUSTUS_BIN_PATH.
# Tue May 21 15:36:14 2024: Checking /home/kosukesano/tools/All_AUGUSTUS_test/bin as potential path for $AUGUSTUS_BIN_PATH.
# Tue May 21 15:36:14 2024: Success! Setting $AUGUSTUS_BIN_PATH to /home/kosukesano/tools/All_AUGUSTUS_test/bin!
# Tue May 21 15:36:14 2024: Trying to set $AUGUSTUS_SCRIPTS_PATH...
# Tue May 21 15:36:14 2024: Found command line argument $AUGUSTUS_SCRIPTS_PATH.
# Tue May 21 15:36:14 2024: Checking /home/kosukesano/tools/All_AUGUSTUS_test/augustus/scripts as potential path for $AUGUSTUS_SCRIPTS_PATH.
# Tue May 21 15:36:14 2024: Success! Setting $AUGUSTUS_SCRIPTS_PATH to /home/kosukesano/tools/All_AUGUSTUS_test/augustus/scripts!
# Tue May 21 15:36:14 2024: Trying to set $PYTHON3_PATH...
# Tue May 21 15:36:14 2024: Did not find environment variable $PYTHON3_PATH.
# Tue May 21 15:36:14 2024: Trying to guess PYTHON3_PATH from location of python3 executable that is available in your $PATH
# Tue May 21 15:36:14 2024: Checking /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker/bin as potential path for $PYTHON3_PATH.
# Tue May 21 15:36:14 2024: Success! Setting $PYTHON3_PATH to /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker/bin!
# Tue May 21 15:36:14 2024: Trying to set $GENEMARK_PATH...
# Tue May 21 15:36:14 2024: Found command line argument $GENEMARK_PATH.
# Tue May 21 15:36:14 2024: Checking /home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes as potential path for $GENEMARK_PATH.
# Tue May 21 15:36:14 2024: Success! Setting $GENEMARK_PATH to /home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes!
# Tue May 21 15:36:14 2024: Trying to set $DIAMOND_PATH...
# Tue May 21 15:36:14 2024: Found command line argument $DIAMOND_PATH.
# Tue May 21 15:36:14 2024: Checking /home/kosukesano/tools/DIAMOND_git_install as potential path for $DIAMOND_PATH.
# Tue May 21 15:36:14 2024: Success! Setting $DIAMOND_PATH to /home/kosukesano/tools/DIAMOND_git_install!
# Tue May 21 15:36:14 2024: Trying to set $PROTHINT_PATH...
# Tue May 21 15:36:14 2024: Found command line argument $PROTHINT_PATH.
# Tue May 21 15:36:14 2024: Checking /home/kosukesano/tools/ProtHint_git_install/ProtHint/bin as potential path for $PROTHINT_PATH.
# Tue May 21 15:36:14 2024: Success! Setting $PROTHINT_PATH to /home/kosukesano/tools/ProtHint_git_install/ProtHint/bin!
# Tue May 21 15:36:14 2024: Trying to set $TSEBRA_PATH...
# Tue May 21 15:36:14 2024: Found command line argument $TSEBRA_PATH.
# Tue May 21 15:36:14 2024: Checking /home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin as potential path for $TSEBRA_PATH.
# Tue May 21 15:36:14 2024: Success! Setting $TSEBRA_PATH to /home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin!
# Tue May 21 15:36:14 2024: Trying to set $CDBTOOLS_PATH...
# Tue May 21 15:36:14 2024: Did not find environment variable $CDBTOOLS_PATH.
# Tue May 21 15:36:18 2024: Trying to guess CDBTOOLS_PATH from location of cdbfasta executable that is available in your $PATH
# Tue May 21 15:36:18 2024: Checking /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker/bin as potential path for $CDBTOOLS_PATH.
# Tue May 21 15:36:18 2024: Success! Setting $CDBTOOLS_PATH to /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker/bin!
# Tue May 21 15:36:18 2024: ERROR: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 2553
/home/kosukesano/tools/All_AUGUSTUS_test/bin/augustus not executable on this machine.~/tools/for_brakertest/share_of_augustusを作成、その下で遺伝研の`augustus`を使うスクリプトを実行
(braker) kosukesano@at138:~/tools/for_brakertest/share_of_augustus$ bash share_brakertest.sh
start at
Tue May 21 16:16:56 JST 2024
#**********************************************************************************
# BRAKER CONFIGURATION
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2 --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config/ --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
# Tue May 21 16:16:59 2024: braker.pl version 3.0.8
# Tue May 21 16:16:59 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Tue May 21 16:16:59 2024: Configuring of BRAKER for using external tools...
# Tue May 21 16:16:59 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Tue May 21 16:16:59 2024: Found command line argument $AUGUSTUS_CONFIG_PATH.
# Tue May 21 16:16:59 2024: Checking /usr/share/augustus/config as potential path for $AUGUSTUS_CONFIG_PATH.
# Tue May 21 16:16:59 2024: Success! Setting $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config!
# Tue May 21 16:16:59 2024: WARNING: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1933
AUGUSTUS_CONFIG_PATH/species (in this case /usr/share/augustus/config/species) is not writeable. BRAKER will try to copy the AUGUSTUS config directory to a writeable location.
# Tue May 21 16:17:05 2024: Log information is stored in file /lustre7/home/kosukesano/tools/for_brakertest/share_of_augustus/braker/braker.log
^C
(braker) kosukesano@at138:~/tools/for_brakertest/share_of_augustus$ ls
braker share_brakertest.sh.e26120966 share_brakertest.sh.o26120966 share_brakertest.sh.pe26120966 share_brakertest.sh.po26120966
share_brakertest.sh share_brakertest.sh.e26120972 share_brakertest.sh.o26120972 share_brakertest.sh.pe26120972 share_brakertest.sh.po26120972
(braker) kosukesano@at138:~/tools/for_brakertest/share_of_augustus$ 0522
gpuノードに投げたらなんか途中まで動いた
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 2
#$ -l s_vmem=1G
#$ -l mem_req=1G
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2\
--AUGUSTUS_CONFIG_PATH=/home/kosukesano/tools/AUGUSTUS_CONFIG_copy/config\
--AUGUSTUS_BIN_PATH=/usr/bin\
--AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
--TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
dateAUGUSTUS_CONFIG_pathを/usr/share/augustus/configに設定した時のエラー
### share_brakertest.sh.e26123608
#**********************************************************************************
# BRAKER CONFIGURATION
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2 --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
# Wed May 22 11:12:37 2024: braker.pl version 3.0.8
# Wed May 22 11:12:37 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Wed May 22 11:12:37 2024: Configuring of BRAKER for using external tools...
# Wed May 22 11:12:37 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Wed May 22 11:12:37 2024: Found environment variable $AUGUSTUS_CONFIG_PATH.
# Wed May 22 11:12:37 2024: Checking /usr/share/augustus/config as potential path for $AUGUSTUS_CONFIG_PATH.
# Wed May 22 11:12:37 2024: Success! Setting $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config!
# Wed May 22 11:12:37 2024: WARNING: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1933
AUGUSTUS_CONFIG_PATH/species (in this case /usr/share/augustus/config/species) is not writeable. BRAKER will try to copy the AUGUSTUS config directory to a writeable location.
ERROR in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1240
Failed to create direcotry /lustre7/home/kosukesano/tools/for_brakertest/share_of_augustus/braker/GeneMark-ES!もう一回やったら別のエラーが出た
### share_brakertest.sh.e26123620
#**********************************************************************************
# BRAKER CONFIGURATION
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=2 --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
# Wed May 22 11:14:38 2024: braker.pl version 3.0.8
# Wed May 22 11:14:38 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Wed May 22 11:14:38 2024: Configuring of BRAKER for using external tools...
# Wed May 22 11:14:38 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Wed May 22 11:14:38 2024: Found environment variable $AUGUSTUS_CONFIG_PATH.
# Wed May 22 11:14:38 2024: Checking /usr/share/augustus/config as potential path for $AUGUSTUS_CONFIG_PATH.
# Wed May 22 11:14:38 2024: Success! Setting $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config!
# Wed May 22 11:14:38 2024: WARNING: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1933
AUGUSTUS_CONFIG_PATH/species (in this case /usr/share/augustus/config/species) is not writeable. BRAKER will try to copy the AUGUSTUS config directory to a writeable location.
ERROR in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 5942
failed to execute: /lustre7/home/kosukesano/tools/braker_git_install/BRAKER/scripts/get_gc_content.py --sequences /home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --print_sequence_length 1> /lustre7/home/kosukesano/tools/for_brakertest/share_of_augustus/braker/gc_content.out 2> /lustre7/home/kosukesano/tools/for_brakertest/share_of_augustus/braker/errors/gc_content.stderr!/home/kosukesano/tools/for_brakertest/share_of_augustus/brakerを消してなかったせい?
試しに自分のディレクトリにコピーしたAUGUSTUS_CONFIG_pathを使ってみる
### share_brakertest.sh.e26123629
ERROR in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1240
Failed to create direcotry /lustre7/home/kosukesano/tools/for_brakertest/share_of_augustus/braker/GeneMark-ES!ちなみに、/lustre7/home/kosukesano/tools/for_brakertest/share_of_augustus/braker/errorsを見ると……
(braker) kosukesano@at138:/lustre7/home/kosukesano/tools/for_brakertest/share_of_augustus/braker/errors$ ls
find_python3_biopython.err find_python3_re.err gc_content.stderrfind_python3_biopython.err、find_python3_re.errには何も書いていなかった。
gc_content.stderrを見ると……
Traceback (most recent call last):
File "/lustre7/home/kosukesano/tools/braker_git_install/BRAKER/scripts/get_gc_content.py", line 215, in <module>
main()
File "/lustre7/home/kosukesano/tools/braker_git_install/BRAKER/scripts/get_gc_content.py", line 52, in main
text = seq_file.read(int(config['mem_size']))
MemoryError↑のやつ、もしかして/lustre7/home/kosukesano/tools/for_brakertest/share_of_augustus/でやっていたから意味なかった?
改めて自分の/home/kosukesano/.../share_of_augustus/brakerファイルを消し、/usr/share/augustus/configにパスを通して再実行
0523
~/tools/for_brakertest/share_of_augustus/output_testを作成、その下でshare_brakertest.shをqsub。ジョブIDは26124509
(結果の出力ファイルであるbrakerがすでに存在しているとエラーを吐くようなので、ディレクトリを移した)
### share_brakertest.sh の中身
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
#$ -l s_vmem=16G
#$ -l mem_req=16G
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/gall/out.p_ctg.fa.sort.softmasked.fasta --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=16\
--AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
--AUGUSTUS_BIN_PATH=/usr/bin\
--AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
--TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
dateジョブ終わっていないけど出力ファイルやエラーファイルができてる。リアルタイムで書き込まれるっぽい?
0524
BRAKER終了!
ID26124509のジョブが終了。~/tools/for_brakertest/share_of_augustus/output_testの下にbrakerというディレクトリができた。
### ディレクトリbrakerの中身
kosukesano@at137:~/tools/for_brakertest/share_of_augustus/output_test$ ls braker/
Augustus GeneMark-EP GeneMark-ES braker.aa braker.codingseq braker.gtf braker.log errors genome_header.map hintsfile.gff prothint.gff species what-to-cite.txtこのうちbraker.gtfが最終産物。
### braker.gtf
ptg000001l_length_376496 AUGUSTUS gene 37356 37715 . + . g1
ptg000001l_length_376496 AUGUSTUS transcript 37356 37715 0.7 + . g1.t1
ptg000001l_length_376496 AUGUSTUS start_codon 37356 37358 . + 0 transcript_id "g1.t1"; gene_id "g1";
ptg000001l_length_376496 AUGUSTUS CDS 37356 37715 0.7 + 0 transcript_id "g1.t1"; gene_id "g1";
ptg000001l_length_376496 AUGUSTUS exon 37356 37715 . + . transcript_id "g1.t1"; gene_id "g1";
ptg000001l_length_376496 AUGUSTUS stop_codon 37713 37715 . + 0 transcript_id "g1.t1"; gene_id "g1";
ptg000001l_length_376496 AUGUSTUS gene 77496 78040 . + . g2
ptg000001l_length_376496 AUGUSTUS transcript 77496 78040 0.83 + . g2.t1
ptg000001l_length_376496 AUGUSTUS start_codon 77496 77498 . + 0 transcript_id "g2.t1"; gene_id "g2";
.
.
.
.
.
.
ptg006399l_length_14628 AUGUSTUS transcript 6869 7388 1 - . g44999.t1
ptg006399l_length_14628 AUGUSTUS stop_codon 6869 6871 . - 0 transcript_id "g44999.t1"; gene_id "g44999";
ptg006399l_length_14628 AUGUSTUS CDS 6869 7221 1 - 2 transcript_id "g44999.t1"; gene_id "g44999";
ptg006399l_length_14628 AUGUSTUS exon 6869 7221 . - . transcript_id "g44999.t1"; gene_id "g44999";
ptg006399l_length_14628 AUGUSTUS intron 7222 7354 1 - . transcript_id "g44999.t1"; gene_id "g44999";
ptg006399l_length_14628 AUGUSTUS CDS 7355 7388 1 - 0 transcript_id "g44999.t1"; gene_id "g44999";
ptg006399l_length_14628 AUGUSTUS exon 7355 7388 . - . transcript_id "g44999.t1"; gene_id "g44999";
ptg006399l_length_14628 AUGUSTUS start_codon 7386 7388 . - 0 transcript_id "g44999.t1"; gene_id "g44999";
ptg006399l_length_14628 AUGUSTUS gene 14340 14628 . - . g45000
ptg006399l_length_14628 AUGUSTUS transcript 14340 14628 1 - . g45000.t1
ptg006399l_length_14628 AUGUSTUS stop_codon 14340 14342 . - 0 transcript_id "g45000.t1"; gene_id "g45000";
ptg006399l_length_14628 AUGUSTUS CDS 14340 14628 1 - 1 transcript_id "g45000.t1"; gene_id "g45000";
ptg006399l_length_14628 AUGUSTUS exon 14340 14628 . - . transcript_id "g45000.t1"; gene_id "g45000";
(END)gene idはg45000まで?昆虫のゲノムとしては多い。
/braker/Augustus/以下にはAugustusのみの結果が出力されているみたい。seqkitを用いて遺伝子数などを確認してみる。seqkitは遺伝研のsingularityにあるものを使う。
### /braker/Augustus/の中身
kosukesano@at137:~/tools/for_brakertest/share_of_augustus/output_test/braker$ ls Augustus/
augustus.hints.aa augustus.hints.codingseq augustus.hints.gtf
### seqkitによる遺伝子数の確認
kosukesano@at137:~/tools/for_brakertest/share_of_augustus/output_test/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat Augustus/augustus.hints.aa
file format type num_seqs sum_len min_len avg_len max_len
Augustus/augustus.hints.aa FASTA Protein 47,989 16,250,546 7 338.6 25,263シーケンス数47,989、braker単体よりも多い。また最小の長さが7と非常に短く、本来遺伝子ではない部分を余計にアノテーションしている?
新規マダラゲノムのソフトマスク
RepeatModeler_test.shが終了した。結局メモリ数48だと入らず、24にした。うまくいったスクリプトは以下。
### RepeatModeler_test.sh
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
RepeatModeler -database BLAST_DATABASE_PREFIX -pa 6
date出力ファイルRepeatModeler_test.sh.o26123564を見るとこんな感じ
### RepeatModeler_test.sh.o26123564の中身
start at
Wed May 22 10:50:55 JST 2024
RepeatModeler Version 2.0.1
===========================
Search Engine = rmblast 2.14.1+
Dependencies: TRF 4.09, RECON , RepeatScout 1.0.6, RepeatMasker 4.1.2
LTR Structural Analysis: Disabled [use -LTRStruct to enable]
Random Number Seed: 1716342701
Database = BLAST_DATABASE_PREFIX .
- Sequences = 209
- Bases = 1295393365
- N50 = 54629423
- Contig Histogram:
Size(bp) Count
-----------------------------------------------------------------------
150024189-160739203 | [ ]
139309176-150024189 | [ ]
128594163-139309176 |
.
.
.
.
.
The RepeatModeler stockholm file is formatted so that it can
easily be submitted to the Dfam database. Please consider contributing
curated families to this open database and be a part of this growing
community resource. For more information contact help@dfam.org.
Fri May 24 05:00:48 JST 2024
(END)大体2日くらいかかってる。
結果はRM_16988.WedMay221052072024というディレクトリに出力された。
### RM_16988.WedMay221052072024の中身
kosukesano@at137:~/tools/for_softmask$ ls RM_16988.WedMay221052072024/
consensi.fa consensi.fa.classified families-classified.stk families.stk round-1 round-2 round-3 round-4 round-5 round-6 tmpConsensi.faこのうちconsensi.fa.classifiedというファイルが最終産物。これを使って次はRepeatMaskerを動かす。RepeatMasker実行スクリプトは以下の通り
### RepeatMasker実行スクリプトRepeatMasker_test.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
RepeatMasker -pa 6 -lib ~/tools/for_softmask/RM_16988.WedMay221052072024/consensi.fa.classified ~/tools/for_softmask/nama_data/231117_madaragenome.fasta
dateホームディレクトリの整理
~/old_environment_until20240430を作り、local、pyenv_conda_environment、tools以外のホームディレクトリ直下ディレクトリをそこに入れた。
### 整理後のホームディレクトリの様子
kosukesano@at137:~$ ls
local old_envilonment_until20240430 pyenv_conda_environment results_sh_eando tools
### old_envilonment_until20240430の中身
kosukesano@at137:~$ ls old_envilonment_until20240430/
EDTA GeMoMa_temp busco_downloads cafetest gall leaf_beetle other_weevil outgroup paml_test ronbun_sp
kosukesano@at137:~$ これによって従来のディレクトリ構造が変化したので注意!
フェモラータゲノムのソフトマスク
ローカルのHDDからフェモのゲノムデータSfem_assembly.fastaを持ってくる
### ローカル環境。scpでSfem_assembly.fastaを遺伝研環境にコピー。
/Volumes/Elements_1/sano/weevil_genome/femo_genome$ ls
2023.11.22.polished.annotated.genome Sfem-1_1.fastq.gz Sfem-1_2.fastq.gz Sfem_assembly.fasta
/Volumes/Elements_1/sano/weevil_genome/femo_genome$ scp Sfem_assembly.fasta kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_softmask/nama_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
Sfem_assembly.fasta 100% 481MB 105.8MB/s 00:04
/Volumes/Elements_1/sano/weevil_genome/femo_genome$ ~/tools/for_softmask/nama_dataに格納。
~/tools/for_softmask/下にSfemorata_softmaskディレクトリを構築。その中でフェモラータゲノムのソフトマスクを行う。
~/tools/for_softmask/Sfemorata_softmask下でBLASTデータベースを作成。データベース名はSfem_BLAST_DATABASE_PREFIXとした。
### EDTAの環境を立ち上げる
kosukesano@at137:~/tools/for_softmask/Sfemorata_softmask$ source ~/tools/pyenv_env/EDTA_profile
### BLASTデータベースの構築
(EDTA2) kosukesano@at137:~/tools/for_softmask/Sfemorata_softmask$ BuildDatabase -name Sfem_BLAST_DATABASE_PREFIX /home/kosukesano/tools/for_softmask/nama_data/Sfem_assembly.fasta
Building database Sfem_BLAST_DATABASE_PREFIX:
Reading /home/kosukesano/tools/for_softmask/nama_data/Sfem_assembly.fasta...
Number of sequences (bp) added to database: 5084 ( 495627753 bp )
### lsで見てみる
(EDTA2) kosukesano@at137:~/tools/for_softmask/Sfemorata_softmask$ ls
RepeatMasker_test.sh Sfem_BLAST_DATABASE_PREFIX.nhr Sfem_BLAST_DATABASE_PREFIX.njs Sfem_BLAST_DATABASE_PREFIX.nni Sfem_BLAST_DATABASE_PREFIX.nsq
RepeatModeler_test.sh Sfem_BLAST_DATABASE_PREFIX.nin Sfem_BLAST_DATABASE_PREFIX.nnd Sfem_BLAST_DATABASE_PREFIX.nog Sfem_BLAST_DATABASE_PREFIX.translation
(EDTA2) kosukesano@at137:~/tools/for_softmask/Sfemorata_softmask$ 続いてRepeatModelerを実行する。ジョブスクリプトSfem_RepeatModeler.shは以下の通り。
### Sfem_RepeatModeler.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
RepeatModeler -database Sfem_BLAST_DATABASE_PREFIX -pa 6
date0527
新規マダラゲノムのソフトマスク続き
~/tools/for_softmask/nama_data内でMadara_ProcessRepeats.shを作成。中身は以下の通り。
### Madara_ProcessRepeats.shの内容
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
ProcessRepeats -maskSource 231117_madaragenome.fasta -xsmall -gff 231117_madaragenome.fasta.cat.gz
date最初の実行では-maskSourceの前に全角の空白がありエラー?手直しをしてもう一度qsub
ついでにフェモラータのゲノムもソフトマスクをした
### Sfem_ProcessRepeats.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
ProcessRepeats -maskSource Sfem_assembly.fasta -xsmall -gff Sfem_assembly.fasta.cat.gz
date
~BUSCOによるアノテーション後のマダラゲノムデータのクオリティ評価
昔ダウンロードしたODBデータと、singularityにあるBUSCOのツールを使って、BRAKERでアノテーションをつけたマダラのゲノムデータを評価した。
kosukesano@at137:~/tools/for_brakertest/share_of_augustus/output_test/braker$ singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco -m protein -i braker.aa -o OUTPUT -l ~/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/ -f
INFO: ***** Start a BUSCO v5.1.3 analysis, current time: 05/27/2024 14:03:46 *****
INFO: Configuring BUSCO with local environment
INFO: Mode is proteins
INFO: 'Force' option selected; overwriting previous results directory
INFO: Downloading information on latest versions of BUSCO data...
INFO: Input file is /home/kosukesano/tools/for_brakertest/share_of_augustus/output_test/braker/braker.aa
INFO: Using local lineages directory /home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/
INFO: Running BUSCO using lineage dataset (eukaryota, 2024-01-08)
INFO: ***** Run HMMER on gene sequences *****
INFO: Running 1013 job(s) on hmmsearch, starting at 05/27/2024 14:03:50
INFO: [hmmsearch] 102 of 1013 task(s) completed
INFO: [hmmsearch] 203 of 1013 task(s) completed
INFO: [hmmsearch] 304 of 1013 task(s) completed
INFO: [hmmsearch] 406 of 1013 task(s) completed
INFO: [hmmsearch] 507 of 1013 task(s) completed
INFO: [hmmsearch] 608 of 1013 task(s) completed
INFO: [hmmsearch] 710 of 1013 task(s) completed
INFO: [hmmsearch] 811 of 1013 task(s) completed
INFO: [hmmsearch] 912 of 1013 task(s) completed
INFO: [hmmsearch] 1013 of 1013 task(s) completed
INFO:
--------------------------------------------------
|Results from dataset |
--------------------------------------------------
|C:88.3%[S:75.2%,D:13.1%],F:6.7%,M:5.0%,n:1013 |
|895 Complete BUSCOs (C) |
|762 Complete and single-copy BUSCOs (S) |
|133 Complete and duplicated BUSCOs (D) |
|68 Fragmented BUSCOs (F) |
|50 Missing BUSCOs (M) |
|1013 Total BUSCO groups searched |
--------------------------------------------------
INFO: BUSCO analysis done. Total running time: 477 seconds
INFO: Results written in /home/kosukesano/tools/for_brakertest/share_of_augustus/output_test/braker/OUTPUT
INFO: For assistance with interpreting the results, please consult the userguide: https://busco.ezlab.org/busco_userguide.html
kosukesano@at137:~/tools/for_brakertest/share_of_augustus/output_test/braker$結果:88%
まあまあ?
0528
RNA-seqデータを用いたBRAKERのテストラン
~/tools/for_brakertest/share_of_augustusの下にrnaplus_output_testディレクトリを作成。その下にRNA_brakertest.shとold_madaragenome_softmasked.fasta(旧out.p_ctg.fa.sort.softmasked.fasta)を用意。またchanged_id_test_rnaディレクトリを作成。
### RNA_brakertest.shの中身
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
#$ -l s_vmem=16G
#$ -l mem_req=16G
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/tools/for_brakertest/share_of_augustus/rnaplus_output_test/old_madaragenome_softmasked.fasta\
--prot_seq=/home/kosukesano/tools/Arthropoda.fa\
--rnaseq_sets_ids=adult-1_1,adult-1_2,adult-2_1,adult-2_2,adult-3_1,adult-3_2 \
--rnaseq_sets_dir=/home/kosukesano/tools/for_brakertest/share_of_augustus/rnaplus_output_test/changed_id_test_rna\
--threads=16\
--species=Smadaranus\
--AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
--AUGUSTUS_BIN_PATH=/usr/bin\
--AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
--TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
dateまたchanged_id_test_rnaディレクトリを作成。
(braker) kosukesano@at137:~/tools/for_brakertest/share_of_augustus/rnaplus_output_test$ ls changed_id_test_rna/
adult-1_1.fastq adult-1_2.fastq adult-2_1.fastq adult-2_2.fastq adult-3_1.fastq adult-3_2.fastq
(braker) kosukesano@at137:~/tools/for_brakertest/share_of_augustus/rnaplus_output_test$最初に以下のエラーが出た.
### RNA_brakertest.sh.e26144319の一部抜粋
# Tue May 28 11:37:09 2024: Trying to set $GENEMARK_PATH...
# Tue May 28 11:37:09 2024: Found command line argument $GENEMARK_PATH.
# Tue May 28 11:37:09 2024: Checking /home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes as potential path for $GENEMARK_PATH.
#*********
# WARNING: Couldn't find gmetp.pl in /home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes. Will not set $GENEMARK_PATH to /home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes!
#*********
# Tue May 28 11:37:09 2024: Did not find environment variable $GENEMARK_PATH.
# Tue May 28 11:37:09 2024: ERROR: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1834
$GENEMARK_PATH not set!/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/binを見てみるとgmetp.plはあったが、コマンド上ではGeneMark-ETP/bin/gmesを指定していたのでエラーが起きたっぽい。
オプションにはGeneMark-ETP/bin/gmesを直接指定するものは無い。とはいえGeneMark-ETP/bin/gmesも重要だったはず。よってコマンド上では上記の/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/binを指定するようにし、braker_profileのほうでそれぞれのパスを通すように変更。
### braker_profileの中身。GeneMark-ETP関連のPATHを開通させた。
#################################
export PATH="/home/kosukesano/tools/braker_git_install/BRAKER/scripts:$PATH"
export PATH="/usr/share/augustus/config:$PATH"
#export PATH="~/tools/All_AUGUSTUS_test/augustus/config:$PATH"
export PATH="/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin:$PATH"
export PATH="/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes:$PATH"
export PATH="/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmst:$PATH"
#export PATH="/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin:$PATH"
#export PATH="/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin:$PATH"
#################################うまく動いてそう。
マダラゲノム、フェモゲノムのソフトマスクの結果
- マダラゲノム:
231117_madaragenome.fasta.masked - フェモゲノム:
Sfem_assembly.fasta.masked
どちらもソフトマスクまで完了!
BRAKERの本番ラン準備
~/tools/for_braker/nama_dataを作成。そこに上記のマダラゲノムとフェモゲノムをコピー。また名前をそれぞれ231117_Madara_softmasked.fastaとSfem_softmasked.fastaに変更。またMadara_RNAseqとSfemo_RNAseqディレクトリを作成し、その下にRNAデータをコピー。
0529
ソフトマスク後のフェモラータゲノムのBUSCO値
### BUSCO_OUTPUT_FEMO/short_summary.specific..BUSCO_OUTPUT_FEMO.txtの中身
# BUSCO version is: 5.1.3
# The lineage dataset is: (Creation date: 2024-01-08, number of genomes: 90, number of BUSCOs: 1013)
# Summarized benchmarking in BUSCO notation for file /home/kosukesano/tools/for_softmask/nama_data/Sfem_assembly.fasta.masked
# BUSCO was run in mode: genome
# Gene predictor used: metaeuk
***** Results: *****
C:98.9%[S:97.8%,D:1.1%],F:0.4%,M:0.7%,n:1013
1002 Complete BUSCOs (C)
991 Complete and single-copy BUSCOs (S)
11 Complete and duplicated BUSCOs (D)
4 Fragmented BUSCOs (F)
7 Missing BUSCOs (M)
1013 Total BUSCO groups searched
Dependencies and versions:
hmmsearch: 3.1
metaeuk: 4.a0f584d98%、非常に高い
BRAKERの本番ラン
### madara_braker.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
#$ -l s_vmem=16G
#$ -l mem_req=16G
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/231117_Madara_softmasked.fasta\
--prot_seq=/home/kosukesano/tools/Arthropoda.fa\
--rnaseq_sets_ids=adult-1_1,adult-1_2,adult-2_1,adult-2_2,adult-3_1,adult-3_2,\
body-1_1,body-1_2,body-2_1,body-2_2,body-3_1,body-3_2,\
large-larva-1_1,large-larva-1_2,large-larva-2_1,large-larva-2_2,large-larva-3_1,large-larva-3_2,\
middle-larva-1_1,middle-larva-1_2,middle-larva-2_1,middle-larva-2_2,middle-larva-3_1,middle-larva-3_2,\
ovary-1_1,ovary-1_2,ovary-2_1,ovary-2_2,ovary-3_1,ovary-3_2 \
--rnaseq_sets_dir=/home/kosukesano/tools/for_braker/nama_data/Madara_RNAseq\
--threads=16\
--species=Smadaranus\
--AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
--AUGUSTUS_BIN_PATH=/usr/bin\
--AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
--TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
date0531
RNA-seqデータを用いたBRAKERのテストラン結果
braker.gtfが出力され、最後まで動作した。
### 出力ディレクトリbrakerの中身
kosukesano@at139:~/tools/for_brakertest/share_of_augustus/rnaplus_output_test$ ls braker/
Augustus GeneMark-ETP braker.aa braker.codingseq braker.gtf braker.log errors genome_header.map hintsfile.gff species what-to-cite.txt
kosukesano@at139:~/tools/for_brakertest/share_of_augustus/rnaplus_output_test$ cd braker/
### augustus.hints.aaの要約
kosukesano@at139:~/tools/for_brakertest/share_of_augustus/rnaplus_output_test/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat Augustus/augustus.hints.aa
file format type num_seqs sum_len min_len avg_len max_len
Augustus/augustus.hints.aa FASTA Protein 33,364 13,441,582 10 402.9 27,212
kosukesano@at139:~/tools/for_brakertest/share_of_augustus/rnaplus_output_test/braker$Augustusだけだと遺伝子は3万くらい
ptg006399l_length_14628 AUGUSTUS stop_codon 6869 6871 . - 0 transcript_id "g13066.t1"; gene_id "g13066";
ptg006399l_length_14628 AUGUSTUS CDS 6869 7171 1 - 0 transcript_id "g13066.t1"; gene_id "g13066";
ptg006399l_length_14628 AUGUSTUS exon 6869 7171 . - . transcript_id "g13066.t1"; gene_id "g13066";
ptg006399l_length_14628 AUGUSTUS start_codon 7169 7171 . - 0 transcript_id "g13066.t1"; gene_id "g13066";
ptg006399l_length_14628 AUGUSTUS gene 14340 14555 . - . g13067
ptg006399l_length_14628 AUGUSTUS transcript 14340 14555 1 - . g13067.t1
ptg006399l_length_14628 AUGUSTUS stop_codon 14340 14342 . - 0 transcript_id "g13067.t1"; gene_id "g13067";
ptg006399l_length_14628 AUGUSTUS CDS 14340 14555 1 - 0 transcript_id "g13067.t1"; gene_id "g13067";
ptg006399l_length_14628 AUGUSTUS exon 14340 14555 . - . transcript_id "g13067.t1"; gene_id "g13067";
ptg006399l_length_14628 AUGUSTUS start_codon 14553 14555 . - 0 transcript_id "g13067.t1"; gene_id "g13067";braker全体では13067個の遺伝子が取れた。 #### 出力ファイルの一つであるbraker.aaをbuscoにかけてみた。 以下のスクリプトを用いてbuscoを実行
### buscoにかけるジョブスクリプトMadara_busco.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 12
echo start at
date
date
singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco\
-m protein\
-i braker.aa\
-o BUSCO_OUTPUT_MADARA_ANNOTATED_AdultRNA\
-l\
/home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/\
-f
~結果
# BUSCO version is: 5.1.3
# The lineage dataset is: (Creation date: 2024-01-08, number of genomes: 90, number of BUSCOs: 1013)
# Summarized benchmarking in BUSCO notation for file /home/kosukesano/tools/for_brakertest/share_of_augustus/rnaplus_output_test/braker/braker.aa
# BUSCO was run in mode: proteins
***** Results: *****
C:91.8%[S:77.1%,D:14.7%],F:2.5%,M:5.7%,n:1013
930 Complete BUSCOs (C)
781 Complete and single-copy BUSCOs (S)
149 Complete and duplicated BUSCOs (D)
25 Fragmented BUSCOs (F)
58 Missing BUSCOs (M)
1013 Total BUSCO groups searched
Dependencies and versions:
hmmsearch: 3.191.8%!?めっちゃ高いじゃん!
BRAKERの本番ランについて
投げるノードをintelに変更。より入りやすく。
また、RNA-seqデータがない時のBUSCO値を見たかったので、OnlyProtein_madaraとOnlyProtein_femoというディレクトリを作り、プロテインデータだけ渡してbrakerを実行させた。
旧型マダラゲノムのBUSCO
旧マダラゲノムのBUSCO値がわからなくなったので、改めてBUSCOにかける。
2024年6月
0603
結果まとめ
- 新規マダラゲノム
- タンパク質リファレンス
BRAKERBUSCO値:89.8%- 遺伝子数:41802
- タンパク質+RNAseqリファレンス
BRAKERBUSCO値:96.8%- 遺伝子数:13653
- タンパク質リファレンス
- 旧マダラゲノム
- タンパク質リファレンス
BRAKERBUSCO値:88.3%- 遺伝子数:45000
- タンパク質+RNAseqリファレンス
BRAKERBUSCO値:91.8%- 遺伝子数:13067
- タンパク質リファレンス
- フェモラータゲノム
- タンパク質リファレンス
BRAKERBUSCO値:92.0%- 遺伝子数:16856
- タンパク質+RNAseqリファレンス
BRAKERBUSCO値:73.2%- 遺伝子数:10818
- タンパク質リファレンス
ちなみに新規マダラゲノム全体でかかった解析時間は127.5時間だった。
0617
コフキゲノムのソフトマスク
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 48
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
RepeatModeler -database Kohuki_data -pa 12
dateblastpを用いた機能アノテーション、その前準備のレファレンス作成
~/reference_sequence/Sory_Tcas_Dmel_Ecol_refを作成。レファレンスに使う種は
- 大腸菌Escherichia coli
- ショウジョウバエDrosophila melanogaster
- コクヌストモドキTribolium castaneum
- ココクゾウムシSitophilus oryzae
とした。この4種のprotein.faaを上記ディレクトリにコピー。
レファレンスとするにはこれらを1つのファイルへと結合しなければならないが、それぞれの遺伝子がどの種由来かわからなくなる。そこでperlスクリプトによりヘッダーの頭に種名を加えるよう加工した。
### ヘッダー加工に使ったperlスクリプト add_hoge_to_headers.plの中身
#!/usr/bin/perl
use strict;
use warnings;
use File::Copy qw(move);
# 入力ファイルと一時ファイルの定義
my $input_fasta = 'Dmel_protein.fasta';
my $temp_fasta = 'temp.fasta';
# 入力ファイルを開く
open(my $in, '<', $input_fasta) or die "Cannot open $input_fasta: $!";
# 一時ファイルを開く
open(my $out, '>', $temp_fasta) or die "Cannot open $temp_fasta: $!";
# 行ごとに処理
while (my $line = <$in>) {
if ($line =~ /^>/) {
# ヘッダー行にhoge_を追加
$line =~ s/^>(.*)/>Dmel_$1/;
}
print $out $line;
}
# ファイルを閉じる
close($in);
close($out);置き換えたいファイルの名前を$input_fastaに指定。
実行の際は以下の通り
kosukesano@at139:~/reference_sequence/Sory_Tcas_Dmel_Ecol_ref$ chmod +x add_hoge_to_headers.pl
kosukesano@at139:~/reference_sequence/Sory_Tcas_Dmel_Ecol_ref$ ./add_hoge_to_headers.pl手を加えたファイルを結合。
kosukesano@at139:~/reference_sequence/Sory_Tcas_Dmel_Ecol_ref$ cat Dmel_protein.fasta Ecol_protein.fasta Sory_protein.fasta Tcas_protein.fasta > merge_4sp.faaデータベースの構築
kosukesano@at139:~/reference_sequence/Sory_Tcas_Dmel_Ecol_ref$ singularity exec -e /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 makeblastdb -in merge_4sp.faa -out merge_4sp -dbtype prot -hash_index
WARNING: Skipping mount /opt/pkg/singularity-ce/4.0.0/var/singularity/mnt/session/etc/resolv.conf [files]: /etc/resolv.conf doesn't exist in container
Building a new DB, current time: 06/17/2024 16:46:13
New DB name: /home/kosukesano/reference_sequence/Sory_Tcas_Dmel_Ecol_ref/merge_4sp
New DB title: merge_4sp.faa
Sequence type: Protein
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 82065 sequences in 4.80911 seconds.
kosukesano@at139:~/reference_sequence/Sory_Tcas_Dmel_Ecol_ref$ ls
Dmel_protein.fasta Sory_protein.fasta add_hoge_to_headers.pl merge_4sp.phd merge_4sp.phr merge_4sp.pog merge_4sp.psi
Ecol_protein.fasta Tcas_protein.fasta merge_4sp.faa merge_4sp.phi merge_4sp.pin merge_4sp.psd merge_4sp.psq
kosukesano@at139:~/reference_sequence/Sory_Tcas_Dmel_Ecol_ref$ blast用のシェルスクリプト準備
### blastp_4sp_test.shの中身
#!/bin/bash
#$ -S /bin/sh
#$ -pe def_slot 8
#$ -l s_vmem=64G,mem_req=64G
#$ -cwd
#$ -o ~/results_sh_eando
#$ -e ~/results_sh_eando
echo "pwd: $(pwd)"
echo HOME: $HOME
echo USER: $USER
echo JOB_ID: $JOB_ID
echo starting at
date
#BLASTの標準列名を定義
header="qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore"
#出力ファイルの定義
output_file="out_madara_blastp_test.txt"
#列名を出力ファイルに書き込む
echo "$header" > $output_file
#BLASTpを実行し、結果を追加する
singularity exec --bind /usr/local/seq /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 blastp \
-query ${HOME}/reference_sequence/madara_protein.fasta \
-db ${HOME}/reference_sequence/Sory_Tcas_Dmel_Ecol_ref/merge_4sp \
-evalue 1e-04 \
-outfmt 6 >> $output_file
echo ending at
dateSingularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit seqkit sort --quiet -l conitg.fa Sfem_softmasked.fasta | seqkit fx2tab -l -n -i -H > length.txt
0618
blastpの結果
### out_madara_blastp_testの中身
qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
g704.t1 Sory_XP_030750051.1 82.587 201 34 1 1 201 1 200 8.77e-99 336
g704.t1 Sory_XP_030750049.1 82.587 201 34 1 1 201 1 200 2.24e-98 335
g704.t1 Sory_XP_030750048.1 82.587 201 34 1 1 201 1 200 2.85e-98 336
g704.t1 Sory_XP_030750047.1 82.587 201 34 1 1 201 1 200 2.85e-98 336
g704.t1 Tcas_XP_973129.2 69.268 205 56 1 1 198 1 205 3.66e-88 300
g704.t1 Tcas_XP_973129.2 42.857 217 89 5 873 1076 459 653 9.64e-14 77.4
g704.t1 Dmel_NP_001401025.1 55.238 210 74 3 1 196 1 204 3.49e-67 228
g704.t1 Dmel_NP_647642.2 52.655 226 85 4 1 210 1 220 1.63e-65 242
g704.t1 Dmel_NP_001246548.1 52.655 226 85 4 1 210 1 220 1.63e-65 242
g704.t1 Dmel_NP_001400988.1 52.655 226 85 4 1 210 1 220 2.69e-65 241
g704.t1 Dmel_NP_728652.1 52.655 226 85 4 1 210 1 220 3.76e-65 241
g704.t1 Dmel_NP_001401027.1 52.655 226 85 4 1 210 1 220 5.79e-65 241
g704.t1 Dmel_NP_001401026.1 52.655 226 85 4 1 210 1 220 1.06e-64 240
g704.t1 Dmel_NP_001097475.2 56.140 114 49 1 1 114 1 113 6.41e-34 144
g704.t1 Sory_XP_030750050.1 86.364 88 12 0 114 201 89 176 4.61e-31 134
g704.t1 Sory_XP_030750053.1 80.851 94 16 1 108 201 2 93 8.27e-31 132
g704.t1 Sory_XP_030750052.1 86.207 87 12 0 115 201 14 100 1.59e-30 132
g704.t1 Dmel_NP_001286903.1 52.778 108 32 2 103 196 5 107 2.62e-25 105
g704.t1 Tcas_XP_008198255.2 61.957 92 28 1 114 198 71 162 1.21e-22 105
g704.t1 Tcas_XP_008198255.2 45.622 217 83 4 873 1076 416 610 1.20e-13 77.0
g704.t1 Dmel_NP_728653.2 46.980 149 55 4 103 232 5 148 2.12e-22 96.7
g704.t1 Tcas_XP_008198256.2 61.538 91 28 1 115 198 11 101 4.04e-22 103
g704.t1 Tcas_XP_008198256.2 45.622 217 83 4 873 1076 355 549 4.51e-14 78.2
g706.t1 Sory_XP_030750036.1 85.816 423 55 4 1 423 1 418 0.0 746できてる!
レファレンスをマダラに、クエリーをその他4種にしたblastp
データベースの構築
kosukesano@at139:~/reference_sequence/Madara$ singularity exec -e /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 makeblastdb -in madara_protein.fasta -out madara_ref -dbtype prot -hash_in
dex
WARNING: Skipping mount /opt/pkg/singularity-ce/4.0.0/var/singularity/mnt/session/etc/resolv.conf [files]: /etc/resolv.conf doesn't exist in container
Building a new DB, current time: 06/18/2024 12:51:23
New DB name: /home/kosukesano/reference_sequence/Madara/madara_ref
New DB title: madara_protein.fasta
Sequence type: Protein
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 16570 sequences in 0.594244 seconds.
kosukesano@at139:~/reference_sequence/Madara$ lsblast用のシェルスクリプト準備
### blastp_RefAsMadara.shの中身
#!/bin/bash
#$ -S /bin/sh
#$ -pe def_slot 8
#$ -l s_vmem=64G,mem_req=64G
#$ -cwd
#$ -o ~/results_sh_eando
#$ -e ~/results_sh_eando
echo "pwd: $(pwd)"
echo HOME: $HOME
echo USER: $USER
echo JOB_ID: $JOB_ID
echo starting at
date
#BLASTの標準列名を定義
header="qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore"
#出力ファイルの定義
output_file="out_madara_as_ref_blastp_.txt"
#列名を出力ファイルに書き込む
echo "$header" > $output_file
#BLASTpを実行し、結果を追加する
#ココクゾウムシのblastp
singularity exec --bind /usr/local/seq /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 blastp \
-query ${HOME}/reference_sequence/Sory_Tcas_Dmel_Ecol_ref/Sory_protein.fasta \
-db ${HOME}/reference_sequence/Madara/madara_ref \
-evalue 1e-04 \
-outfmt 6 \
-out out_Sory_blastp_RefAsMadara.txt
#コクヌストモドキのblastp
singularity exec --bind /usr/local/seq /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 blastp \
-query ${HOME}/reference_sequence/Sory_Tcas_Dmel_Ecol_ref/Tcas_protein.fasta \
-db ${HOME}/reference_sequence/Madara/madara_ref \
-evalue 1e-04 \
-outfmt 6 \
-out out_Tcas_blastp_RefAsMadara.txt
#ショウジョウバエのblastp
singularity exec --bind /usr/local/seq /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 blastp \
-query ${HOME}/reference_sequence/Sory_Tcas_Dmel_Ecol_ref/Dmel_protein.fasta \
-db ${HOME}/reference_sequence/Madara/madara_ref \
-evalue 1e-04 \
-outfmt 6 \
-out out_Dmel_blastp_RefAsMadara.txt
#大腸菌のblastp
singularity exec --bind /usr/local/seq /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 blastp \
-query ${HOME}/reference_sequence/Sory_Tcas_Dmel_Ecol_ref/Ecol_protein.fasta \
-db ${HOME}/reference_sequence/Madara/madara_ref \
-evalue 1e-04 \
-outfmt 6 \
-out out_Ecol_blastp_RefAsMadara.txt
echo ending at
dateマダラの種名をヘッダーにするの忘れてた…..
0620
コフキゲノムソフトマスクの続き
kosukesano@at139:~/tools/for_softmask/kohuki_softmask/RM_20252.MonJun171354072024$ ls
consensi.fa consensi.fa.classified families-classified.stk families.stk round-1 round-2 round-3 round-4 round-5 round-6 tmpConsensi.fa
kosukesano@at139:~/tools/for_softmask/kohuki_softmask/RM_20252.MonJun171354072024$ consensi.fa.classifiedが出力され、RepeatModelerは終了した。
続いてRepeatMaskerを動かす
### Kohuki_RepeatMasker.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
RepeatMasker -pa 6 -lib ~/tools/for_softmask/kohuki_softmask/RM_20252.MonJun171354072024/consensi.fa.classified /home/kosukesano/tools/for_softmask/kohuki_softmask/180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta
dateこれをqsub_betaで投げた
0625
遺伝研の緊急メンテナンスが修了!
0620のKohuki_RepeatMasker.shをもう一度投げた。
Orthofinderのテスト
~/tools/にfor_orthofinderディレクトリを作成。その中でSmad_Agra_Cass_Dpon_Sory_Tcas_fasta_dirディレクトリを作成し、以下の6種のタンパク質ファイル(.fasta)を入れた。
- Smycronyx madaranus(マダラケシツブゾウムシ)
- Anthonomus grandis grandis(ワタミハナゾウムシ)
- Dendroctonus ponderosae(マツノキクイムシ)
- Sitophilus oryzae(ココクゾウムシ)
- Ceutorhynchus assimilis(キャベツサヤゾウムシ)
- Tribolium castaneum(コクヌストモドキ)
kosukesano@at139:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir$ ls
Agra.fasta Cass.fasta Dpon.fasta OrthoFinder Smad.fasta Sory.fasta Tcas.fastaまた、~/tools/for_orthofinder/直下にOrthofinder実行シェルスクリプトSmad_Agra_Cass_Dpon_Sory_Tcas_orthotest.shを作成した。
#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 5
#$ -l medium
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
singularity exec /usr/local/biotools/o/orthofinder:2.5.4--hdfd78af_0 orthofinder -f ~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir -t 5 -a 5
dateこれをqsubで投げた。
結果
kosukesano@at139:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25$ ls
Citation.txt Gene_Duplication_Events Log.txt Orthogroups Phylogenetic_Hierarchical_Orthogroups Putative_Xenologs Single_Copy_Orthologue_Sequences WorkingDirectory
Comparative_Genomics_Statistics Gene_Trees Orthogroup_Sequences Orthologues Phylogenetically_Misplaced_Genes Resolved_Gene_Trees Species_Tree
kosukesano@at139:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25$ 出力ファイルが全て揃った!実行できた!
0627
CAFEの前処理
~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/以下を全てローカルに転送した。
ローカルでは~/bio/for_cafe/Original_dataを作り、上記ディレクトリを格納した。
Rstudioで以下のコードを実行した。
Orthologs_raw <- read_tsv(paste("Original_data/OrthoFinder/Results_Jun25/Orthogroups/Orthogroups.GeneCount.tsv", sep = "/"))
##Enzanはorthogroupのなかで遺伝子数が変なやつを検出するためのmatrix
Enzan <- Orthologs_raw %>%
select(!c(Orthogroup, Total)) %>%
t()
##saidai, saisyouは各Orthogroupの中で、各種が持っているコピー数の最大値及び最小値を記したdf
saidai <- Enzan %>%
apply(2, max) %>%
as.data.frame() %>%
rename(max_real = ".")
saisyou <- Enzan %>%
apply(2, min) %>%
as.data.frame() %>%
rename(min_real = ".")
##Orthologs_1は各Orthogroupsの最大値、最小値もくっつけたdf
Orthologs_1 <- Orthologs_raw %>% select(!c(Total)) %>%
bind_cols(saidai, saisyou)
##最大値と最小値の差
Orthologs_2 <-Orthologs_1 %>%
mutate(sa = max_real - min_real) %>%
filter(max_real != min_real) %>%
filter(sa < 50)
##外れ値と遺伝子ファミリー数が全種で共通の行を省いた。最後に1列目を複製し列名をいじって、CAFEへのインプットデータの出来上がり。
Orthologs_3 <- Orthologs_2 %>%
mutate(Description = Orthogroup, ID = Orthogroup) %>%
relocate(Description, ID) %>%
select(!c(Orthogroup, max_real, min_real, sa))
Orthologs_3 %>%
write_tsv(paste("Processed_data/Orthogroups.GeneCount2.tsv", sep = "/"))#, quote = FALSE) #,row.names = FALSE)
##Did you finish creating ultrametric tree with makeultrametric.R?
############
tree = read.tree("Original_data/OrthoFinder/Results_Jun25/Species_Tree/SpeciesTree_rooted.txt")
mrca = getMRCA(tree, tip=c('Tcas', 'Sory')) #分岐年代推定に使うノードの指定
tree2 = chronopl(
tree,
100000,
age.min = 152.3, # 推定分岐年代の最小値(MYA)
age.max = 236.2, # 推定分岐年代の最大値(MYA)
node = mrca, # getMRCAで指定したノード
S = 1,
tol = 1e-20,
CV = FALSE,
eval.max = 500,
iter.max = 500
)
is.ultrametric(tree2) # ultrametricかどうか確認
write.tree(tree2, file = "tree_ultrametric.nwk") # ultrametric系統樹の保存
こうしてできたOrthogroups.GeneCount2.tsvとtree_ultrametric.nwkをDDBJの~/tools/for_cafe/madara_4weevil_Tcas_cafetestに転送した。
DDBJの~/tools/for_cafe/madara_4weevil_Tcas_cafetestにて、CAFE5を実行した。
kosukesano@at138:~/tools/for_cafe/madara_4weevil_Tcas_cafetest$ singularity exec -e /usr/local/biotools/c/cafe:5.0.0--h5b5514e_2 cafe5 -i Orthogroups.GeneCount2.tsv -t tree_ultrametric.nwk
Command line: /usr/local/bin/cafe5 -i Orthogroups.GeneCount2.tsv -t tree_ultrametric.nwk
Filtering families not present at the root from: 12784 to 8037
No root family size distribution specified, using uniform distribution
Optimizer strategy: Nelder-Mead with similarity cutoff
Iterations: 300
Expansion: 2
Reflection: 1
Starting Search for Initial Parameter Values
Lambda: 0.001938743751488
Score (-lnL): 112660.90445756
Lambda: 0.001938743751488
Score (-lnL): 112660.90445756
Lambda: 0.0020356809390624
Score (-lnL): 112528.29246004
Lambda: 0.0021326181266368
Score (-lnL): 112446.2150052
Lambda: 0.0022295553142112
Score (-lnL): 112408.34055546
Lambda: 0.00242342968936
.
.
.
.
.
.
.
(省略)
.
.
.
Completed 20 iterations
Time: 0H 0M 2S
Best match is: 0.0022734326447198
Final -lnL: 112404.42731051
42 values were attempted (0% rejected)
Inferring processes for Base model
Score (-lnL): 112404.42731051
Maximum possible lambda for this topology: 0.004233700254022
Computing pvalues...
done!
Starting reconstruction processes for Base model
Done!
kosukesano@at138:~/tools/for_cafe/madara_4weevil_Tcas_cafetest$結果
kosukesano@at138:~/tools/for_cafe/madara_4weevil_Tcas_cafetest$ ls
Orthogroups.GeneCount2.tsv results tree_ultrametric.nwk
kosukesano@at138:~/tools/for_cafe/madara_4weevil_Tcas_cafetest$ ls results/
Base_asr.tre Base_branch_probabilities.tab Base_change.tab Base_clade_results.txt Base_count.tab Base_family_likelihoods.txt Base_family_results.txt Base_results.txtちゃんとファイルが出力された!
0628
コフキゲノムのソフトマスク結果
kosukesano@at138:~/tools/for_softmask/kohuki_softmask$ ls RM_32208.TueJun251102422024/
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta 180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45387.masked
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta.cat.all.gz 180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45387.tmp.simple1
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45383.cat 180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45388.masked
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45383.masked 180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45388.tmp.simple1
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45384.cat consensi.fa.classified.ndb
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45384.masked consensi.fa.classified.nhr
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45385.cat consensi.fa.classified.nin
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45385.masked consensi.fa.classified.njs
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45386.masked consensi.fa.classified.not
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45386.masked.2.3.5.75.20.33.7.summary.html consensi.fa.classified.nsq
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45386.masked.s1.2.3.5.75.20.33.7.1.html consensi.fa.classified.ntf
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45386.masked.s1.2.3.5.75.20.33.7.1.txt.html consensi.fa.classified.nto
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45386.masked.s3.2.3.5.75.20.33.7.1.html makeblastdb.log
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45386.masked.s3.2.3.5.75.20.33.7.1.txt.html ncResults-1719541376-36687.err
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45386.masked.s4.2.3.5.75.20.33.7.txt.html ncResults-1719541376-36687.out
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45386.tmp.custom trfResults-1719541376-29544.err
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta_batch-45386.tmp.simple1 trfResults-1719541377-29544.out
kosukesano@at138:~/tools/for_softmask/kohuki_softmask$ 途中で終わっているみたい。メモリ不足?
Reciprocal Best Hitの探索
### rbh.pyの中身
# -*- coding: utf-8 -*-
import pandas as pd
engine='python'
# forward BLAST結果の読み込み
forward = pd.read_csv('out_madara_blastp_test.txt', sep='\s+', header=None, low_memory=False)
forward.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
dtypes = {
'qseqid': str,
'sseqid': str,
'pident': float,
'length': int,
'mismatch': int,
'gapopen': int,
'qstart': int,
'qend': int,
'sstart': int,
'send': int,
'evalue': float,
'bitscore': float
}
# 逆BLAST結果の読み込みと処理
reverse_files = ['out_Sory_blastp_RefAsMadara.txt', 'out_Tcas_blastp_RefAsMadara.txt', 'out_Dmel_blastp_RefAsMadara.txt', 'out_Ecol_blastp_RefAsMadara.txt']
rbh_hits = []
for reverse_file in reverse_files:
reverse = pd.read_csv(reverse_file, sep='\s+', header=None, low_memory=False)
reverse.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
# RBHの判定
for idx, row in forward.iterrows():
query, subject = row['qseqid'], row['sseqid']
reverse_hit = reverse[(reverse['qseqid'] == subject) & (reverse['sseqid'] == query)]
if not reverse_hit.empty:
rbh_hits.append(row)
# RBH結果の保存
rbh_df = pd.DataFrame(rbh_hits)
rbh_df.to_csv('reciprocal_best_hits_madara.txt', sep='\s+', index=False)
~### rbh.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 24
echo "pwd: $(pwd)"
echo HOME: $HOME
echo USER: $USER
echo JOB_ID: $JOB_ID
echo starting at
date
python /home/kosukesano/reference_sequence/rbh.py
echo ending at
date注意
- ノードは
gpuを指定すること。mediumにはpandasが入っていない。 - メモリは12以上を指定すること。6だとqwのままランしない。
Orthofinderのアウトプットから種の系統樹を構築する
遺伝研スパコンの~/tools/for_orthofinder/下にmake_philo_treeというディレクトリを作成し、以下のスクリプトfasta_concatinate.shを作成した。
### fasta_concatinate.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 5
#$ -l medium
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
# Enter the directory containing the fasta files
filesout="/home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir" ## Please replace with the actual directory containing the fasta files
# Define the output directory and output file
new="output_directory"
mkdir -p $new
# Concatenate all fasta files into one file
for file in "$filesout"/*.fasta; do
cat "$file" >> "./${new}/all_seq.fa"
done
dateこれを作業ノードで実行権限を付与して実行した。
kosukesano@at138:~/tools/for_orthofinder/make_philo_tree$ chmod +x fasta_concatinate.sh
kosukesano@at138:~/tools/for_orthofinder/make_philo_tree$ ./fasta_concatinate.sh
start at
Fri Jun 28 17:13:55 JST 2024
Fri Jun 28 17:13:56 JST 2024
kosukesano@at138:~/tools/for_orthofinder/make_philo_tree$output_directory/all_seq.faに全ての.fastaファイルをConcatinateしたファイルができた。
2024年7月
0701
コフキゲノムのRepeatMasker結果
0628に再度投げたジョブが終わっていた。~/tools/for_softmask/kohuki_softmaskに180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta.cat.gzができたので成功!
上手く行ったスクリプトは以下。
### Kohuki_RepeatMasker.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 48
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
RepeatMasker -pa 12 -lib ~/tools/for_softmask/kohuki_softmask/RM_20252.MonJun171354072024/consensi.fa.classified /home/kosukesano/tools/for_softmask/kohuki_softmask/180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta
dateコフキゲノムのProcessRepeat
以下のスクリプトをジョブとして投げた。
### Kohuki_ProcessRepeat.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 48
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
ProcessRepeats -maskSource 180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta -xsmall -gff 180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta.cat.gz
date0702
ReciprocalBestHitの検索
# -*- coding: utf-8 -*-
import pandas as pd
engine='python'
print('start')
# forward BLAST結果の読み込み
forward = pd.read_csv('out_madara_blastp_test.txt', sep='\s+', header=None, low_memory=False)
forward.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
dtypes = {
'qseqid': str,
'sseqid': str,
'pident': float,
'length': int,
'mismatch': int,
'gapopen': int,
'qstart': int,
'qend': int,
'sstart': int,
'send': int,
'evalue': float,
'bitscore': float
}
print('forward_BLAST was scanned')
# 逆BLAST結果の読み込みと処理
reverse_files = ['out_Sory_blastp_RefAsMadara.txt', 'out_Tcas_blastp_RefAsMadara.txt', 'out_Dmel_blastp_RefAsMadara.txt', 'out_Ecol_blastp_RefAsMadara.txt']
rbh_hits = []
for reverse_file in reverse_files:
reverse = pd.read_csv(reverse_file, sep='\s+', header=None, low_memory=False)
reverse.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
print('reverse_BLAST', reverse, ' was scanned')
# RBHの判定
for idx, row in forward.iterrows():
query, subject = row['qseqid'], row['sseqid']
reverse_hit = reverse[(reverse['qseqid'] == subject) & (reverse['sseqid'] == query)]
if not reverse_hit.empty:
rbh_hits.append(row)
#print(reverse, ' was judged')
print(query, 'was judged')
# RBH結果の保存
rbh_df = pd.DataFrame(rbh_hits)
rbh_df.to_csv('reciprocal_best_hits_madara.txt', sep='\s+', index=False)これを実行したのだが、めちゃくちゃ遅い。どうもforward.iterrows()が悪さをしているらしい。
forward.iterrows()を使わないスクリプトを作成し、実行。
### new_rbh.pyの中身
import pandas as pd
print('start')
# forward BLAST結果の読み込み
forward = pd.read_csv('out_madara_blastp_test.txt', sep='\s+', header=None, low_memory=False)
forward.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
dtypes = {
'qseqid': str,
'sseqid': str,
'pident': float,
'length': int,
'mismatch': int,
'gapopen': int,
'qstart': int,
'qend': int,
'sstart': int,
'send': int,
'evalue': float,
'bitscore': float
}
print('forward_BLAST was scanned')
# 逆BLAST結果の読み込みと処理
reverse_files = ['out_Sory_blastp_RefAsMadara.txt', 'out_Tcas_blastp_RefAsMadara.txt', 'out_Dmel_blastp_RefAsMadara.txt', 'out_Ecol_blastp_RefAsMadara.txt']
rbh_hits = []
for reverse_file in reverse_files:
reverse = pd.read_csv(reverse_file, sep='\s+', header=None, low_memory=False)
reverse.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
print('reverse_BLAST', reverse_file, ' was scanned')
# forwardとreverseをマージし、条件に合う行を抽出
merged = forward.merge(reverse, left_on=['qseqid', 'sseqid'], right_on=['sseqid', 'qseqid'], suffixes=('_fwd', '_rev'))
rbh_hits.extend(merged.to_dict('records'))
print(len(merged), 'hits were judged')
# RBH結果の保存
rbh_df = pd.DataFrame(rbh_hits)
rbh_df.to_csv('reciprocal_best_hits_madara.txt', sep='\t', index=False)
import pandas as pd
print('start')
# forward BLAST結果の読み込み
forward = pd.read_csv('out_madara_blastp_test.txt', sep='\s+', header=None, low_memory=False)
forward.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
dtypes = {
'qseqid': str,
'sseqid': str,
'pident': float,
'length': int,
'mismatch': int,
'gapopen': int,
'qstart': int,
'qend': int,
'sstart': int,
'send': int,
'evalue': float,
'bitscore': float
}
print('forward_BLAST was scanned')
# 逆BLAST結果の読み込みと処理
reverse_files = ['out_Sory_blastp_RefAsMadara.txt', 'out_Tcas_blastp_RefAsMadara.txt', 'out_Dmel_blastp_RefAsMadara.txt', 'out_Ecol_blastp_RefAsMadara.txt']
rbh_hits = []
for reverse_file in reverse_files:
reverse = pd.read_csv(reverse_file, sep='\s+', header=None, low_memory=False)
reverse.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
print('reverse_BLAST', reverse_file, ' was scanned')
# forwardとreverseをマージし、条件に合う行を抽出
merged = forward.merge(reverse, left_on=['qseqid', 'sseqid'], right_on=['sseqid', 'qseqid'], suffixes=('_fwd', '_rev'))
rbh_hits.extend(merged.to_dict('records'))
print(len(merged), 'hits were judged')
# RBH結果の保存
rbh_df = pd.DataFrame(rbh_hits)
rbh_df.to_csv('reciprocal_best_hits_madara.txt', sep='\t', index=False)成功!
~/reference_sequenceにreciprocal_best_hits_madara.txtができた。
### reciprocal_best_hits_madara.txtの中身
qseqid_fwd sseqid_fwd pident_fwd length_fwd mismatch_fwd gapopen_fwd qstart_fwd qend_fwd sstart_fwd send_fwd evalue_fwd bitscore_fwd qseqid_rev sseqid_rev pident_rev length_rev mismatch_rev gapopen_rev qstart_rev qend_rev sstart_rev send_rev evalue_rev bitscore_rev
g704.t1 Sory_XP_030750051.1 82.587 201 34 1 1 201 1 200 8.77e-99 336 Sory_XP_030750051.1 g704.t1 83.505 194 31 1 1 193 1 194 1.07e-99 336.0
g704.t1 Sory_XP_030750049.1 82.587 201 34 1 1 201 1 200 2.24e-98 335 Sory_XP_030750049.1 g704.t1 83.505 194 31 1 1 193 1 194 2.67e-99 336.0
g704.t1 Sory_XP_030750048.1 82.587 201 34 1 1 201 1 200 2.85e-98 336 Sory_XP_030750048.1 g704.t1 83.505 194 31 1 1 193 1 194 2.54e-99 336.0
g704.t1 Sory_XP_030750047.1 82.587 201 34 1 1 201 1 200 2.85e-98 336 Sory_XP_030750047.1 g704.t1 83.505 194 31 1 1 193 1 194 2.54e-99 336.0
g704.t1 Sory_XP_030750050.1 86.364 88 12 0 114 201 89 176 4.61e-31 134 Sory_XP_030750050.1 g704.t1 88.889 81 9 0 89 169 114 194 1.59e-31 132.0
g704.t1 Sory_XP_030750053.1 80.851 94 16 1 108 201 2 93 8.27e-31 132 Sory_XP_030750053.1 g704.t1 84.706 85 11 1 4 86 110 194 4.19e-31 131.0
g704.t1 Sory_XP_030750052.1 86.207 87 12 0 115 201 14 100 1.59e-30 132 Sory_XP_030750052.1 g704.t1 89.873 79 8 0 15 93 116 194 6.350000000000001e-31 130.0
g706.t1 Sory_XP_030750036.1 85.816 423 55 4 1 423 1 418 0.0 746 Sory_XP_030750036.1 g706.t1 85.816 423 55 4 1 418 1 423 0.0 724.0
g706.t1 Sory_XP_030750034.1 85.309 388 52 4 36 423 46 428 0.0 675 Sory_XP_030750034.1 g706.t1 85.309 388 52 4 46 428 36 423 0.0 655.0
g706.t1 Sory_XP_030750033.1 84.478 393 52 5 35 423 62 449 0.0 674 Sory_XP_030750033.1 g706.t1 84.478 393 52 5 62 449 35 423 0.0 654.0
g706.t1 Sory_XP_030750037.1 86.126 382 48 4 42 423 13 389 0.0 672 Sory_XP_030750037.1 g706.t1 86.126 382 48 4 13 389 42 423 0.0 650.0
g706.t1 Sory_XP_030750035.1 84.655 391 54 5 33 423 39 423 0.0 671 Sory_XP_030750035.1 g706.t1 84.655 391 54 5 39 423 33 423 0.0 652.0
g706.t1 Sory_XP_030763203.1 85.676 370 46 4 27 396 17 379 0.0 650 Sory_XP_030763203.1 g706.t1 85.676 370 46 4 17 379 27 396 コフキゲノムのProcessRepeat結果
### 180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta.maskedの中身
>0 edges=0 left=4 right=5 ver=1.10 style=1
ACCCAGTCCCACATCCTTCATATCCACCAGTATATTAGAAGCAATTTTCT
CATCTCTATTTGGCTCCGTTACTTTACTTGTTATGACCTACAATATTTTA
ATTGAAATCAGTACTTTCACATCAATACTGATATTTACTAGTTTTGTTGA
AACCGACCGATACCGATTTTTTGGTCATATCAGTCATTGATCCGATTTTC
CTGGCCTGCCGTTTACGCAAATTAAGTTAATTATTAAGTAACTACATGAC
TTAAAATTTCTCTAAATTAAAGTTACTCACTATAACTAAAATATTATTGT
AATGAGTAAGATTCCACATTATTAGACAATGTGTAACcagaggcggtttt
tccattggtttatttgtgcagtaaccacccaataaaataatacagttttc
aaattctcacaatatatcattaaaataatataactttatattctattaca
taaattattatttacaggttaagctctaTAAgtggaatttataaaaaaaa
tagaaatagtttcggaaatcgagtgcttatataaaagaatggacttcctc
ggccgaagataaacttagtttaagaatgtgtcataagaaactaaccccca
aattaaccaaagggatttaatttaaactaggtctaaattgacaatcgcaa
aatggagccagctgagtattcaagatatttggcgtttttgtcattgcatt
gatttattaaaatattttacataaataataatttaaaaaaagtttaaacg
tagttttaaggttgaacaatattgaataatttcgttggttTTAATCGAAA
ATTTAATTATTAGTAAGATTAAAACACtatgtttttgggccacgccccaa
atttttttagaaggttagaaaatatattgtttttatagtacacaattaat
atttttatggtaaatcaatattatagcttgttaaccatagacaaaccctc
tttgtgcgaaagtgggcctaaaaccaagggctacaaataaaaggagagcg
atatgctaa成功!
コフキゲノムのBRAKER実行に移る。
コフキゲノムのBRAKER
~/tools/for_braker/nama_dataに180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta.maskedをkohuki_softmasked.fastaという名前でコピー。
~/tools/for_braker/Kohukiディレクトリを作成し、その直下で以下のシェルスクリプトを作成した。
### kohuki_braker.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/kohuki_softmasked.fasta\
--prot_seq=/home/kosukesano/tools/Arthropoda.fa\
--threads=16\
--AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
--AUGUSTUS_BIN_PATH=/usr/bin\
--AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
--TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
echo end at
dateこれをジョブとして投げた。
コフキゲノムのBUSCO
~/tools/for_braker/nama_dataにkohuki_busco.shを作成し、ジョブとして投げた
### kohuki_busco.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
echo start at
date
singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco\
-m genome\
-i /home/kosukesano/tools/for_braker/nama_data/kohuki_softmasked.fasta\
-o BUSCO_OUTPUT_KOHUKI_GENOME\
-l\
/home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/\
-f
date0703
ReciprocalBestHitの検索
それぞれの生物種ごとにRBHを検索し、bitscoreが最も高い行のみを抽出する。bitscoreが同じ場合は一番上のもののみを抽出する。
上手くいったスクリプトは以下の通り。/home/kosukesano/reference_sequence/new_rbh.py
### new_rbh.pyの中身
import pandas as pd
print('start')
# forward BLAST結果の読み込み
forward = pd.read_csv('out_madara_blastp_test.txt', sep='\s+', header=None, low_memory=False)
forward.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
dtypes = {
'qseqid': str,
'sseqid': str,
'pident': float,
'length': int,
'mismatch': int,
'gapopen': int,
'qstart': int,
'qend': int,
'sstart': int,
'send': int,
'evalue': float,
'bitscore': float
}
print('forward_BLAST was scanned')
# 逆BLAST結果の読み込みと処理
reverse_files = ['out_Sory_blastp_RefAsMadara.txt', 'out_Tcas_blastp_RefAsMadara.txt', 'out_Dmel_blastp_RefAsMadara.txt', 'out_Ecol_blastp_RefAsMadara.txt']
for reverse_file in reverse_files:
reverse = pd.read_csv(reverse_file, sep='\s+', header=None, low_memory=False)
reverse.columns = ['qseqid', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue', 'bitscore']
# forwardとreverseをマージし、条件に合う行を抽出
merged = forward.merge(reverse, left_on=['qseqid', 'sseqid'], right_on=['sseqid', 'qseqid'], suffixes=('_fwd', '_rev'))
print(len(merged), 'hits were judged')
# bitscore_fwdを数値型に変換
merged['bitscore_fwd'] = pd.to_numeric(merged['bitscore_fwd'], errors='coerce')
# qseqid_fwdが重複する行の中で、bitscore_fwdが最も高い行だけを抽出
idx = merged.groupby('qseqid_fwd')['bitscore_fwd'].idxmax()
best_hits = merged.loc[idx]
# 各ファイルごとにRBH結果を保存
output_file = f'reciprocal_best_hits_{reverse_file.split("_")[1]}.txt'
best_hits.to_csv(output_file, sep='\t', index=False) # タブ文字を区切り文字として使用
print('All RBH results have been saved.')実行結果は以下の通り。
kosukesano@at137:~/reference_sequence$ python new_rbh.py
start
forward_BLAST was scanned
4877948 hits were judged
12427340 hits were judged
8626859 hits were judged
21265 hits were judged
All RBH results have been saved.
kosukesano@at137:~/reference_sequence$
kosukesano@at137:~/reference_sequence$ ls reciprocal_best_hits_*
reciprocal_best_hits_Dmel.txt reciprocal_best_hits_Ecol.txt reciprocal_best_hits_Sory.txt reciprocal_best_hits_Tcas.txt reciprocal_best_hits_madara.txt
kosukesano@at137:~/reference_sequence$reciprocal_best_hits_Dmel.txt、reciprocal_best_hits_Ecol.txt、reciprocal_best_hits_Sory.txt、reciprocal_best_hits_Tcas.txtが出力された。
これらのファイルを遺伝子名で結合する
### merge_rbh.pyの中身
import pandas as pd
# 出力ファイルのリスト
output_files = ['reciprocal_best_hits_Ecol.txt', 'reciprocal_best_hits_Dmel.txt', 'reciprocal_best_hits_Tcas.txt', 'reciprocal_best_hits_Sory.txt']
# 各ファイルからqseqid_fwdとsseqid_fwdを抽出
dataframes = []
for file in output_files:
df = pd.read_csv(file, sep='\t', low_memory=False)
df = df[['qseqid_fwd', 'sseqid_fwd']]
df.columns = ['qseqid_fwd', f'sseqid_fwd_{file.split("_")[2]}']
dataframes.append(df)
# qseqid_fwdを基準に横に結合
merged_df = dataframes[0]
for df in dataframes[1:]:
merged_df = pd.merge(merged_df, df, on='qseqid_fwd', how='outer')
# 結果を保存
merged_df.to_csv('merged_best_hits.txt', sep='\t', index=False)
print('Merged results have been saved.')実行結果は以下の通り
kosukesano@at137:~/reference_sequence$ python merge_rbh.py
/lustre7/home/kosukesano/reference_sequence/merge_rbh.py:17: FutureWarning: Passing 'suffixes' which cause duplicate columns {'sseqid_fwd_hits_x'} in the result is deprecated and will raise a MergeError in a future version.
merged_df = pd.merge(merged_df, df, on='qseqid_fwd', how='outer')
Merged results have been saved.
kosukesano@at137:~/reference_sequence$0705
フェモラータのゲノムが違う?
最新版のフェモラータゲノム(pilonでアセンブルされている)を/Volumes/Elements_1/240705/2023.11.22.polished.annotated.genomeに置いた。
:/Volumes/Elements_1/240705/2023.11.22.polished.annotated.genome$ ls
Bessho Sfem_protein.faa assembly.pilon.annotation.txt bwa_pilon_sagra_20231019.txt
Sfem_cds.fasta assembly.pilon.annotation.gff assembly.pilon.fasta eukaryotic_gene_finding.pdf遺伝研スパコンで~/tools/for_softmask/nama_data/Sfem_pilonディレクトリを作成し、assembly.pilon.fastaをコピーした。
###ローカルで実行
:~/Desktop$ scp /Volumes/Elements_1/240705/2023.11.22.polished.annotated.genome/assembly.pilon.fasta kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_softmask/nama_data/Sfem_pilon
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
assembly.pilon.fasta 100% 479MB 75.7MB/s 00:06
:~/Desktop$###遺伝研スパコンの様子
kosukesano@at137:~/tools/for_softmask/nama_data/Sfem_pilon$ ls
assembly.pilon.fasta新規フェモラータゲノムのソフトマスク
遺伝研スパコンにて~/tools/for_softmask/Sfemorata_pilon_softmaskディレクトリを作成し、以下を実行。
kosukesano@at137:~/tools/for_softmask/Sfemorata_pilon_softmask$ source ~/pyenv_conda_environment/.pyenv_profile
kosukesano@at137:~/tools/for_softmask/Sfemorata_pilon_softmask$ source ~/tools/pyenv_env/EDTA_profile
(EDTA2) kosukesano@at137:~/tools/for_softmask/Sfemorata_pilon_softmask$ BuildDatabase -name Sfem_BLAST_DATABASE ~/tools/for_softmask/nama_data/Sfem_pilon/assembly.pilon.fasta
Building database Sfem_BLAST_DATABASE:
Reading /home/kosukesano/tools/for_softmask/nama_data/Sfem_pilon/assembly.pilon.fasta...
Number of sequences (bp) added to database: 5084 ( 495481058 bp )
(EDTA2) kosukesano@at137:~/tools/for_softmask/Sfemorata_pilon_softmask$ ls
Sfem_BLAST_DATABASE.nhr Sfem_BLAST_DATABASE.njs Sfem_BLAST_DATABASE.nni Sfem_BLAST_DATABASE.nsq
Sfem_BLAST_DATABASE.nin Sfem_BLAST_DATABASE.nnd Sfem_BLAST_DATABASE.nog Sfem_BLAST_DATABASE.translation
(EDTA2) kosukesano@at137:~/tools/for_softmask/Sfemorata_pilon_softmask$Sfem_RepeatModeler.shを作成しジョブとして投げた。
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
RepeatModeler -database Sfem_BLAST_DATABASE -pa 6
date昔フェモラータのソフトマスクしたときのファイルをほぼコピーしただけ
blastpの結果と機能アノテーションの紐付け
まずショウジョウバエのgenomic.gffから、遺伝子IDと機能情報を対応させた辞書を作る。作成用のスクリプトは以下の通り
### makedic_test.pyの中身
import pandas as pd
import re
# GFFファイルのパス
gff_file = '/home/kosukesano/old_envilonment_until20240430/outgroup/Drosophila_melanogaster/ncbi_dataset/data/GCF_000001215.4/genomic.gff'
# merged_best_hits.txtのパス
merged_file = 'merged_best_hits.txt'
# gene_function.txtのパス
gene_function_file = 'gene_function.txt'
# GFFファイルから遺伝子IDとタンパク質の機能名を抽出
gene_function = {}
with open(gff_file, 'r') as file:
for line in file:
if line.startswith('#') or line.strip() == '':
continue
parts = line.strip().split('\t')
if len(parts) < 9:
continue
attributes = parts[8]
match_gene_id = re.search(r'GeneID:(\d+)', attributes)
match_product = re.search(r'product=([^;]+)', attributes)
if match_gene_id and match_product:
gene_id = match_gene_id.group(1)
product = match_product.group(1)
gene_function[gene_id] = product
# gene_functionをgene_function.txtとして保存
with open(gene_function_file, 'w') as file:
for gene_id, product in gene_function.items():
file.write(f"{gene_id}\t{product}\n")0708
blastpの結果と機能アノテーションの紐付け続き
各生物のprotein.faaのヘッダー行を抽出して辞書を作成、それを次々に結合。
実行コードが記されたpythonファイルは以下の通り。
### fnanno.pyの中身
import pandas as pd
# merged_best_hits.txtのパスと読み込み
merged_best_hits_file = 'merged_best_hits.txt'
merged_df = pd.read_csv(merged_best_hits_file, sep='\t')
#出力ファイルのパス
output_file = 'merged_with_gene_function.csv'
output_file2 = 'merged_with_gene_function.txt'
##########################################################
###Dmel
# Dmel_protein.faaのパス
dmel_protein_file = 'Dmel_protein.faa'
# 遺伝子IDと遺伝子機能を抽出するリスト
gene_ids = []
gene_functions = []
# Dmel_protein.faaファイルを読み込み、ヘッダー行から情報を抽出
with open(dmel_protein_file, 'r') as file:
for line in file:
if line.startswith('>'):
parts = line.strip().split(' ', 1)
gene_id = "Dmel_" + parts[0][1:] # ">"を取り除き、"Dmel_"を追加
parts2 = parts[1].split(' [')
gene_function = parts2[0] # 遺伝子機能
gene_ids.append(gene_id)
gene_functions.append(gene_function)
# 遺伝子IDと遺伝子機能のデータフレームを作成
data = {'GeneID': gene_ids, 'GeneFunction': gene_functions}
df = pd.DataFrame(data)
# マージ
merged_df = pd.merge(
merged_df,
df,
left_on='Dmelanogaster',
right_on='GeneID',
how='left'
)
# 不要なGeneID列を削除し、列名をDmel_GeneFunctionに変更
merged_df = merged_df.drop(columns=['GeneID'])
merged_df = merged_df.rename(columns={'GeneFunction': 'Dmel_GeneFunction'})
# 確認のため最初の数行を表示
print("Dmel")
print(merged_df)
####################################################################
###Ecol
# Ecol_protein.faaのパス
Ecol_protein_file = 'Ecol_protein.faa'
# 遺伝子IDと遺伝子機能を抽出するリスト
gene_ids = []
gene_functions = []
# Ecol_protein.faaファイルを読み込み、ヘッダー行から情報を抽出
with open(Ecol_protein_file, 'r') as file:
for line in file:
if line.startswith('>'):
parts = line.strip().split(' ', 1)
gene_id = "Ecol_" + parts[0][1:] # ">"を取り除き、"Ecol_"を追加
parts2 = parts[1].split(' [')
gene_function = parts2[0] # 遺伝子機能
gene_ids.append(gene_id)
gene_functions.append(gene_function)
# 遺伝子IDと遺伝子機能のデータフレームを作成
data = {'GeneID': gene_ids, 'GeneFunction': gene_functions}
df = pd.DataFrame(data)
# マージ
merged_df = pd.merge(
merged_df,
df,
left_on='Ecoli',
right_on='GeneID',
how='left'
)
# 不要なGeneID列を削除し、列名をEcol_GeneFunctionに変更
merged_df = merged_df.drop(columns=['GeneID'])
merged_df = merged_df.rename(columns={'GeneFunction': 'Ecol_GeneFunction'})
# 確認のため最初の数行を表示
print("Ecol")
print(merged_df)
####################################################################
###Tcas
# Tcas_protein.faaのパス
Tcas_protein_file = 'Tcas_protein.faa'
# 遺伝子IDと遺伝子機能を抽出するリスト
gene_ids = []
gene_functions = []
# Tcas_protein.faaファイルを読み込み、ヘッダー行から情報を抽出
with open(Tcas_protein_file, 'r') as file:
for line in file:
if line.startswith('>'):
parts = line.strip().split(' ', 1)
gene_id = "Tcas_" + parts[0][1:] # ">"を取り除き、"Tcas_"を追加
parts2 = parts[1].split(' [')
gene_function = parts2[0] # 遺伝子機能
gene_ids.append(gene_id)
gene_functions.append(gene_function)
# 遺伝子IDと遺伝子機能のデータフレームを作成
data = {'GeneID': gene_ids, 'GeneFunction': gene_functions}
df = pd.DataFrame(data)
# マージ
merged_df = pd.merge(
merged_df,
df,
left_on='Tcastaneum',
right_on='GeneID',
how='left'
)
# 不要なGeneID列を削除し、列名をTcas_GeneFunctionに変更
merged_df = merged_df.drop(columns=['GeneID'])
merged_df = merged_df.rename(columns={'GeneFunction': 'Tcas_GeneFunction'})
# 確認のため最初の数行を表示
print("Tcas")
print(merged_df)
####################################################################
###Sory
# Sory_protein.faaのパス
Sory_protein_file = 'Sory_protein.faa'
# 遺伝子IDと遺伝子機能を抽出するリスト
gene_ids = []
gene_functions = []
# Sory_protein.faaファイルを読み込み、ヘッダー行から情報を抽出
with open(Sory_protein_file, 'r') as file:
for line in file:
if line.startswith('>'):
parts = line.strip().split(' ', 1)
gene_id = "Sory_" + parts[0][1:] # ">"を取り除き、"Sory_"を追加
parts2 = parts[1].split(' [')
gene_function = parts2[0] # 遺伝子機能
gene_ids.append(gene_id)
gene_functions.append(gene_function)
# 遺伝子IDと遺伝子機能のデータフレームを作成
data = {'GeneID': gene_ids, 'GeneFunction': gene_functions}
df = pd.DataFrame(data)
# マージ
merged_df = pd.merge(
merged_df,
df,
left_on='Soryzae',
right_on='GeneID',
how='left'
)
# 不要なGeneID列を削除し、列名をSory_GeneFunctionに変更
merged_df = merged_df.drop(columns=['GeneID'])
merged_df = merged_df.rename(columns={'GeneFunction': 'Sory_GeneFunction'})
# 確認のため最初の数行を表示
print("Sory")
print(merged_df)
#########################################################
merged_df_with_function = merged_df.reindex(columns=['Madara',\
'Ecoli','Ecol_GeneFunction',\
'Dmelanogaster', 'Dmel_GeneFunction',\
'Tcastaneum', 'Tcas_GeneFunction',\
'Soryzae', 'Sory_GeneFunction'])
# 新しいデータフレームをCSVファイルとして保存
merged_df_with_function.to_csv(output_file, index=False)
# 新しいデータフレームをTXTファイルとして保存
merged_df_with_function.to_csv(output_file2, sep='\t', index=False) # タブ文字を区切り文字として使用
# 確認のため最初の数行を表示
print(merged_df_with_function)0709
コフキゲノムのBRAKER終了
~/tools/for_braker/Kohuki/braker/ディレクトリが構築され、結果が出力された。 一方でkohuki_braker.sh.o26238954には以下のWARNING MESSAGEが出力された。
start at
Wed Jul 3 01:40:00 JST 2024
# Wed Jul 3 01:40:37 2024: Log information is stored in file /lustre7/home/kosukesano/tools/for_braker/Kohuki/braker/braker.log
#*********
# WARNING: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1413
file /lustre7/home/kosukesano/tools/for_braker/Kohuki/braker/genome.fa contains a highly fragmented assembly (2372896 scaffolds). This may lead to problems when running AUGUSTUS via braker in parallelized mode. You set --threads=16. You should run braker.pl in linear mode on such genomes, though (--threads=1).
#*********contigが細かすぎる?
threads=1にしたコフキゲノムのBRAKER
~/tools/for_braker/Kohuki_thread_oneを作成、以下のスクリプトをジョブとして投げた。
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/kohuki_softmasked.fasta\
--prot_seq=/home/kosukesano/tools/Arthropoda.fa\
--threads=1\
--AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
--AUGUSTUS_BIN_PATH=/usr/bin\
--AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
--TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
echo end at
date0716
threads=1にしたコフキゲノムのBRAKER結果
変わらず最後まで出力されていない?
kosukesano@at139:~/tools/for_braker/Kohuki_thread_one$ ls
braker kohuki_braker.sh kohuki_braker.sh.e26250715 kohuki_braker.sh.o26250715 kohuki_braker.sh.pe26250715 kohuki_braker.sh.po26250715
kosukesano@at139:~/tools/for_braker/Kohuki_thread_one$ ls braker/
GeneMark-EP GeneMark-ES GeneMark-ES.stdout braker.log errors gc_content.out genome.fa genome_header.map proteins.fa species what-to-cite.txtフェモラータゲノムのソフトマスク続き
kosukesano@at139:~/tools/for_softmask/Sfemorata_pilon_softmask$ ls RM_76722.MonJul80922572024/
consensi.fa consensi.fa.classified families-classified.stk families.stk round-1 round-2 round-3 round-4 round-5 round-6 tmpConsensi.faRepeatModelerは無事できた。続いてRepeatMaskerに移る。
### Sfem_RepeatMasker.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
RepeatMasker -pa 6 -lib ~/tools/for_softmask/Sfemorata_pilon_softmask/RM_*/consensi.fa.classified /home/kosukesano/tools/for_softmask/nama_data/../nama_data/Sfem_pilon/assembly.pilon.fasta
date0718
BRAKER実行結果の確認
seqkitでの確認をしてなかったので改めて確認
### マダラゲノム(RNA_seqデータ含)
kosukesano@at137:~/tools/for_braker/Madara/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file format type num_seqs sum_len min_len avg_len max_len
braker.aa FASTA Protein 16,570 8,790,187 5 530.5 20,186
######################################################################手動での系統樹作成
前準備として~/tools/for_orthofinder/make_philo_treeの下にManualPhylo_1.pyを作成、実行した。
###ManualPhylo_1の中身
##analysis_manual.pptxの#46も参照
##AFTER you made MSA file(all_seq.fa) in DDBJ with makeMSA.sh
##時間は10secほど
import numpy as np
import pandas as pd
import os
path = "~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/"
withpath = "../Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/"
OGs = pd.read_table(path + "Orthogroups/Orthogroups.tsv")
##with openは相対パスしか受け付けないらしい
new = pd.DataFrame()
with open(withpath + "Orthogroups/Orthogroups_SingleCopyOrthologues.txt", "r") as fin:
for line in fin:
li = line.rstrip()
new = pd.concat([new, OGs[OGs["Orthogroup"] == li]])
print(new)
new.to_csv(path + "ManualPhylo_data/OG_list.txt", sep = " ", index = False, header = False)
##OG_list.txtと同じ順番の種名リストであるspecies_list.txtを作成
##できたOG_list.txtに、DDBJで作ったall_seq.faで配列情報を与える。
li = []
allspe = OGs.columns.tolist()
allspe2 = allspe[1:len(allspe)]
with open(withpath + "ManualPhylo_data/species_list.txt", "w") as file:
for column_name in allspe2:
file.write("%s\n" % column_name)実行時のコマンド
kosukesano@at137:~/tools/for_orthofinder/make_philo_tree$mkdir ../Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/ManualPhylo_data
kosukesano@at137:python ManualPhylo_1.py先にManualPhylo_dataディレクトリを作っておかないとうまくいかない。
続いて~/tools/for_orthofinder/make_philo_treeの下にManualPhylo_2.pyを作成、実行した。
###ManualPhylo_2の中身
##ManualPhylo_1.pyの続き
import sys
from Bio import SeqIO
path = "../Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/ManualPhylo_data/"
fasta_in = sys.argv[1] #1番目の引数には上記のall_seq.faなどfastaファイルを指定する
query_in = sys.argv[2] #2番目の引数には上記のOG_list.txtなどオーソログファイルを指定する
for q in open(query_in, "r"): #オーソログファイルを開いて1行づつ読み込む
query = q.split() #スペース毎に切りとってリスト形式でqueryに保存する
f = open(path + query[0], 'w') #最初の列(OG名)と同じ名前のファイルを作成する
for record in SeqIO.parse(fasta_in, 'fasta'): #fastaファイルを開くSeqIOを使ってパースする(1項目づつ読み込む)
id_part = record.id #fastaのID部分を読み込む
desc_part = record.description #fastaのdescription部分を読み込む
seq = record.seq #fastaの配列部分を読み込む
for i in range(len(query)): #オーソログファイル中の各OGに含まれる配列数を数えて、その分繰り返す(python2の人はrange を x rangeにする)
if desc_part == query[i] : #オーソログファイルの配列descriptionとfastaの配列descriptionが一致したら、、、
fasta_seq = '>' + desc_part + '\n' + seq + '\n' #fasta形式に整え
print(fasta_seq) #標準出力にfastaを出力(進行状況把握用)
f.write(str(fasta_seq)) #各OGファイルにfastaを出力
f.close()
##できたOGファイルは、align.shやOG_list.txtと同じ場所に
##align.shのある場所までいき、作動。cwdを231016/ManualPhylo_dataにしないとtrimalが作動せず、イライラ実行時のコマンド
kosukesano@at138:~/tools/for_orthofinder/make_philo_tree$ python ManualPhylo_2.py output_directory/all_seq.fa ../Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/ManualPhylo_data/OG_list.txt 続く解析で使用するMAFFTとtrimalをインストールする。そのために新規mamba環境を作成。
~/tools/pyenv_env/にManualPhilo_profileを作成。
###ManualPhilo_profileの中身
source ~/.bash_profile
source ~/pyenv_conda_environment/.pyenv_profile
pyenv global mambaforge-22.9.0-3
# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
eval "$__conda_setup"
else
if [ -f "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3//etc/profile.d/conda.sh" ]; then
. "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/conda.sh"
else
export PATH="/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/bin:$PATH"
fi
fi
unset __conda_setup
if [ -f "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh" ]; then
. "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh"
fi
# <<< conda initialize <<<
conda activate MPTこれをすぐ実行すると、(MPT環境がまだできていないため)mambaのbase環境が立ち上がる。この状態でMPT環境を作成する。
(MPT) kosukesano@at138:~/tools/for_MAFFT$ mamba install -c bioconda -y mafft
__ __ __ __
/ \ / \ / \ / \
/ \/ \/ \/ \
███████████████/ /██/ /██/ /██/ /████████████████████████
/ / \ / \ / \ / \ \____
/ / \_/ \_/ \_/ \ o \__,
/ _/ \_____/ `
|/
███╗ ███╗ █████╗ ███╗ ███╗██████╗ █████╗
████╗ ████║██╔══██╗████╗ ████║██╔══██╗██╔══██╗
██╔████╔██║███████║██╔████╔██║██████╔╝███████║
██║╚██╔╝██║██╔══██║██║╚██╔╝██║██╔══██╗██╔══██║
██║ ╚═╝ ██║██║ ██║██║ ╚═╝ ██║██████╔╝██║ ██║
╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝
mamba (1.1.0) supported by @QuantStack
GitHub: https://github.com/mamba-org/mamba
Twitter: https://twitter.com/QuantStack
█████████████████████████████████████████████████████████████
Looking for: ['mafft']
bioconda/linux-64 5.6MB @ 3.4MB/s 1.8s
bioconda/noarch 5.3MB @ 2.9MB/s 2.0s
conda-forge/noarch 18.0MB @ 6.6MB/s 3.2s
conda-forge/linux-64 42.6MB @ 7.5MB/s 6.7s
Transaction
Prefix: /lustre7/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/envs/MPT
Updating specs:
- mafft
Package Version Build Channel Size
───────────────────────────────────────────────────────────────────────────────
Install:
───────────────────────────────────────────────────────────────────────────────
+ _libgcc_mutex 0.1 conda_forge conda-forge/linux-64 Cached
+ _openmp_mutex 4.5 2_gnu conda-forge/linux-64 Cached
+ gawk 5.3.0 ha916aea_0 conda-forge/linux-64 Cached
+ gettext 0.22.5 h59595ed_2 conda-forge/linux-64 475kB
+ gettext-tools 0.22.5 h59595ed_2 conda-forge/linux-64 3MB
+ gmp 6.3.0 hac33072_2 conda-forge/linux-64 460kB
+ libasprintf 0.22.5 h661eb56_2 conda-forge/linux-64 43kB
+ libasprintf-devel 0.22.5 h661eb56_2 conda-forge/linux-64 34kB
+ libgcc-ng 14.1.0 h77fa898_0 conda-forge/linux-64 842kB
+ libgettextpo 0.22.5 h59595ed_2 conda-forge/linux-64 171kB
+ libgettextpo-devel 0.22.5 h59595ed_2 conda-forge/linux-64 37kB
+ libgomp 14.1.0 h77fa898_0 conda-forge/linux-64 457kB
+ libstdcxx-ng 14.1.0 hc0a3c3a_0 conda-forge/linux-64 4MB
+ mafft 7.525 h031d066_1 bioconda/linux-64 3MB
+ mpfr 4.2.1 h9458935_1 conda-forge/linux-64 643kB
+ ncurses 6.5 h59595ed_0 conda-forge/linux-64 887kB
+ readline 8.2 h8228510_1 conda-forge/linux-64 Cached
Summary:
Install: 17 packages
Total download: 14MB
───────────────────────────────────────────────────────────────────────────────
libgettextpo 170.6kB @ 703.9kB/s 0.2s
libgomp 456.9kB @ 1.7MB/s 0.3s
libgcc-ng 842.1kB @ 3.0MB/s 0.3s
libasprintf-devel 34.2kB @ 119.1kB/s 0.1s
libstdcxx-ng 3.9MB @ 12.2MB/s 0.3s
gettext 475.1kB @ 1.4MB/s 0.1s
mpfr 643.1kB @ 1.8MB/s 0.1s
ncurses 887.5kB @ 2.4MB/s 0.1s
libasprintf 43.2kB @ 115.9kB/s 0.1s
libgettextpo-devel 36.8kB @ 90.5kB/s 0.1s
gettext-tools 2.7MB @ 6.6MB/s 0.4s
gmp 460.1kB @ 1.0MB/s 0.1s
mafft 3.5MB @ 7.7MB/s 0.1s
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
(MPT) kosukesano@at138:~/tools/for_MAFFT$ ls
(MPT) kosukesano@at138:~/tools/for_MAFFT$ ls -a
. ..
(MPT) kosukesano@at138:~/tools/for_MAFFT$ mafft0719
ASTRALの前準備
~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/ManualPhylo_data以下でmakealltree.shを作成した。
### makealltree.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date
# Singularityイメージのパスを指定
SINGULARITY_IMAGE="/usr/local/biotools/i/iqtree:2.3.3--h21ec9f0_0"
# 作業ディレクトリに移動
cd ~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/ManualPhylo_data
# 出力ファイル
output_file="all_trees.nwk"
# 既存の出力ファイルを削除
if [ -f $output_file ]; then
rm $output_file
fi
# *.maffted.trimed.edit.fa ファイルを処理
for file in *.maffted.trimed.edit.fa; do
# ファイル名から拡張子を除いたベース名を取得
base_name=$(basename $file .maffted.trimed.edit.fa)
# Singularityを使用してIQ-TREEを実行して系統樹を作成
singularity exec -e $SINGULARITY_IMAGE iqtree2 -s $file -nt AUTO -bb 1000 -cptime 600 -pre ${base_name}
# 作成された系統樹ファイル (.treefile) を output_file に追加
if [ -f ${base_name}.treefile ]; then
echo -n "${base_name}: " >> $output_file
cat ${base_name}.treefile >> $output_file
echo "" >> $output_file
else
echo "Error: ${base_name}.treefile not found" >&2
fi
done
echo "All trees have been written to $output_file"
date0722追記。これを作業ノードで実行したら終わらずタイムアウト。
0722
ASTRAL前準備
makealltree.shをqsubで投げた。
フェモラータ新規ゲノムのソフトマスク続き
RepeatMaskerはできた。続いてProcessRepeatsに移る。
### Sfem_ProcessRepeats.sh
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
ProcessRepeats -maskSource ~/tools/for_softmask/nama_data/Sfem_pilon/assembly.pilon.fasta -xsmall -gff ~/tools/for_softmask/nama_data/Sfem_pilon/assembly.pilon.fasta.cat.gz
dateこれをqsubで投げた。
0724
ASTRAL実行
makealltree.shが終わったので出力ファイルを使ってラン
makealltree.sh出力のall_trees.nwkは/home/kosukesano/tools/for_ASTRAL/Astral/dataに格納した。
kosukesano@at139:~/tools/for_ASTRAL/Astral$ java -jar astral.5.7.8.jar -i data/all_trees.nwk -o 240724_result/out.tre 2> 240724_result/out.log
kosukesano@at139:~/tools/for_ASTRAL/Astral$ ls 240724_result/
out.log
kosukesano@at139:~/tools/for_ASTRAL/Astral$### /240724_result/out.logの中身
================== ASTRAL =====================
This is ASTRAL version 5.7.8
Gene trees are treated as unrooted
Exception in thread "main" java.lang.RuntimeException: Failed to Parse Tree number: 1
at phylonet.coalescent.CommandLine.readInputTrees(CommandLine.java:813)
at phylonet.coalescent.CommandLine.readOptions(CommandLine.java:321)
at phylonet.coalescent.CommandLine.main(CommandLine.java:486)
Caused by: phylonet.tree.io.ParseException: Number expected
at phylonet.tree.io.NewickReader.readNode(NewickReader.java:428)
at phylonet.tree.io.NewickReader.readTree(NewickReader.java:374)
at phylonet.tree.io.NewickReader.readTree(NewickReader.java:95)
at phylonet.coalescent.CommandLine.readInputTrees(CommandLine.java:780)
... 2 more0725
ASTRALのラン終了
240724_result/out.logにアウトプットファイルが出力された。
### out.logの中身
================== ASTRAL =====================
This is ASTRAL version 5.7.8
Gene trees are treated as unrooted
Exception in thread "main" java.lang.RuntimeException: Failed to Parse Tree number: 1
at phylonet.coalescent.CommandLine.readInputTrees(CommandLine.java:813)
at phylonet.coalescent.CommandLine.readOptions(CommandLine.java:321)
at phylonet.coalescent.CommandLine.main(CommandLine.java:486)
Caused by: phylonet.tree.io.ParseException: Number expected
at phylonet.tree.io.NewickReader.readNode(NewickReader.java:428)
at phylonet.tree.io.NewickReader.readTree(NewickReader.java:374)
at phylonet.tree.io.NewickReader.readTree(NewickReader.java:95)
at phylonet.coalescent.CommandLine.readInputTrees(CommandLine.java:780)
... 2 more何かエラーを吐いている?
フェモラータ新規ゲノムのソフトマスク完了
RepeatMaskerの結果が返ってきた
kosukesano@at139:~/tools/for_softmask/nama_data/Sfem_pilon$ ls
Sfem_ProcessRepeats.sh Sfem_ProcessRepeats.sh.pe26282981 assembly.pilon.fasta.cat.gz assembly.pilon.fasta.out.gff
Sfem_ProcessRepeats.sh.e26282981 Sfem_ProcessRepeats.sh.po26282981 assembly.pilon.fasta.masked assembly.pilon.fasta.preMonJul220243592024.RMoutput
Sfem_ProcessRepeats.sh.o26282981 assembly.pilon.fasta assembly.pilon.fasta.maskedが目的の産物。これを~/tools/for_braker/nama_dataにSfem_pilon_softmasked.fastaとしてコピーした。
フェモラータ新規ゲノムのBRAKER
~/tools/for_brakerにFemo_pilonを作成し、その下でfemo_braker.shを作成。
### femo_braker.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/Sfem_pilon_softmasked.fasta\
--prot_seq=/home/kosukesano/tools/Arthropoda.fa\
--rnaseq_sets_ids=Sfem-1_1,femo-larva_1,femo_H1_1,femo_H3_1,femo_L1_1,femo_L3_1,femo_O1_1,femo_O3_1,femo_T1_1,femo_T3_1,Sfem-1_2,femo-larva_2,femo_H1_2,femo_H3_2,femo_L1_2,femo_L3_2,femo_O1_2,femo_O3_2,femo_T1_2,femo_T3_2,femo-female_1,femo-male_1,femo_H2_1,femo_H4_1,femo_L2_1,femo_L4_1,femo_O2_1,femo_O4_1,femo_T2_1,femo_T4_1,femo-female_2,femo-male_2,femo_H2_2,femo_H4_2,femo_L2_2,femo_L4_2,femo_O2_2,femo_O4_2,femo_T2_2,femo_T4_2 \
--rnaseq_sets_dir=/home/kosukesano/tools/for_braker/nama_data/Sfem_RNAseq\
--threads=16\
--species=Sfemorata_pilon\
--AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
--AUGUSTUS_BIN_PATH=/usr/bin\
--AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
--TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
dateこれをqsubで投げた。
0729
フェモラータ新規ゲノムのBRAKER完了
BRAKERは無事動作し、ファイルが出力された。
kosukesano@at137:~/tools/for_braker/Femo_pilon$ ls
braker femo_braker.sh.e26283956 femo_braker.sh.o26283956 femo_braker.sh.pe26283956 femo_braker.sh.po26283956
femo_braker.sh femo_braker.sh.e26283961 femo_braker.sh.o26283961 femo_braker.sh.pe26283961 femo_braker.sh.po26283961
kosukesano@at137:~/tools/for_braker/Femo_pilon$ cd braker/
kosukesano@at137:~/tools/for_braker/Femo_pilon/braker$ ls
Augustus GeneMark-ETP braker.aa braker.codingseq braker.gtf braker.log errors genome_header.map hintsfile.gff species what-to-cite.txt
kosukesano@at137:~/tools/for_braker/Femo_pilon/braker$ フェモラータ新規ゲノムのBUSCO
BRAKER後の出力ファイルについて、BUSCOを用いてクオリティチェックを行う。前回のフェモラータゲノムはBRAKER3での出力ファイルのクオリティ値が低かったため、高くなっているといいな。
### femo_busco.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
echo start at
date
singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco\
-m protein\
-i /home/kosukesano/tools/for_braker/Femo_pilon/braker/braker.aa\
-o BUSCO_OUTPUT_FEMO_WITHRNA\
-l\
/home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/\
-f
dateこれをqsubで投げた。
CDS配列を用いたOrthofinder
PAML用にCDS配列のみで6種のOrthofinderを行った。まず~/tools/for_orthofinder/にSmad_Agra_Cass_Dpon_Sory_Tcas_CDS_dirディレクトリを作成し、6種の CDS配列をコピーした。
kosukesano@at137:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir$ ls
Agra.fna Cass.fna Dpon.fna Smad.fna Sory.fna Tcas.fna
kosukesano@at137:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir$これをもとに下記シェルスクリプトを記述、実行した。
### Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 5
#$ -l medium
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
singularity exec /usr/local/biotools/o/orthofinder:2.5.4--hdfd78af_0 orthofinder\
-f ~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir\
-t 5\
-a 5\
-d
dateASTRALの実行
all_trees.nwkのOG番号が悪さをしているのでは?そこを切り取るコードを書き、実行。
### modify.pyの中身
# 元のファイルと新しいファイルのパスを設定
input_file_path = 'all_trees.nwk'
output_file_path = 'modified_trees.nwk'
# 元のファイルを開いて処理
with open(input_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
for line in infile:
# 行を ': ' で分割し、要素が2つ以上の場合のみ処理
parts = line.split(': ', 1)
if len(parts) > 1:
modified_line = parts[1]
# 新しいファイルに書き込み
outfile.write(modified_line)これを以下の通り実行。
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$ python modify.py
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$ ls
all_trees.nwk modified_trees.nwk modify.py
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$出力結果は以下の通り
### modified_trees.nwkの中身
(Agra:0.0635712699,Cass:0.0805557052,(Dpon:0.1219665493,(Smad:0.0432864630,(Sory:0.0817207751,Tcas:0.1397248588)70:0.0168256619)38:0.0053674344)54:0.0170031209);
(Agra:0.1655023482,((Cass:0.2728454655,Dpon:0.4276770145)67:0.0351066692,Smad:0.1218709603)53:0.0390916641,(Sory:0.1302322053,Tcas:0.8475027507)100:0.2395842901);
(Agra:0.1498975617,(Cass:0.1228354087,(Dpon:0.1591568837,(Sory:0.1686172722,Tcas:0.6188635095)85:0.0775748177)61:0.0269975353)69:0.0263101100,Smad:0.2004686939);
(Agra:0.1046182209,(Cass:0.1173390835,(Dpon:0.1265307000,Smad:0.0652513950)81:0.0262711442)56:0.0151016525,(Sory:0.0943592513,Tcas:0.3993841186)47:0.0458834130);
(Agra:0.4439818619,((Cass:0.3270195555,Smad:0.3502005062)69:0.0801526161,Dpon:0.5246458008)67:0.0745695349,(Sory:0.4691251648,Tcas:0.5980869091)96:0.3513668760);
(Agra:0.0823664659,(((Cass:0.0603291137,Sory:0.1007744717)54:0.0286917972,Dpon:0.1413255434)38:0.0325222870,Smad:0.0574125897)42:0.0394349257,Tcas:0.2854548697);
(Agra:0.6430510957,Cass:0.4001509671,((Dpon:0.4706496916,Smad:0.4285065087)45:0.0605668934,(Sory:0.4005216201,Tcas:0.7245661234)52:0.0659584460)55:0.0425434177);
(Agra:0.1849238595,(((Cass:0.0720240778,Dpon:0.1265536083)47:0.0193627993,Smad:0.1655456501)18:0.0000020169,Sory:0.1467750516)66:0.0448440021,Tcas:0.2867528256);
(Agra:0.1120432096,(Cass:0.0451799005,Smad:0.0279681266)82:0.0126641184,(Dpon:0.0822097283,(Sory:0.1540588110,Tcas:0.2053968900)98:0.0531593726)83:0.0129938791);
(Agra:0.2307249419,(Cass:0.1422763867,Smad:0.2341206100)49:0.0515095597,(Dpon:0.3216560906,(Sory:0.3535060629,Tcas:0.8600170053)43:0.1084091415)27:0.0133982920);
(Agra:0.0816817581,(Cass:0.1560371473,Smad:0.2263294156)46:0.0450279441,(Dpon:0.1543923964,(Sory:0.1934776920,Tcas:0.5730013110)61:0.0810103641)55:0.0484005680);
(Agra:0.1290642034,(Cass:0.1201414309,Dpon:0.2097540535)34:0.0033281012,(Smad:0.1322768450,(Sory:0.2402523393,Tcas:0.2655477108)89:0.1409222511)85:0.0790612548);
(Agra:0.0774654965,(((Cass:0.0089173938,Dpon:0.0421759868)46:0.0096013194,Sory:0.0138448697)39:0.0179560775,Smad:0.0256089159)77:0.0357100139,Tcas:0.0318576291);
(Agra:0.0805531475,(Cass:0.0617472257,(Dpon:0.0430911645,Smad:0.0471918037)65:0.0138789643)98:0.0342890105,(Sory:0.0783736246,Tcas:0.1968719542)100:0.0780188258);
(Agra:0.2384458487,Cass:0.3002711919,((Dpon:0.2187183648,(Sory:0.2390630242,Tcas:0.7571742224)58:0.0576932185)41:0.0296961138,Smad:0.1436197371)61:0.0378697622);
(Agra:0.0791366226,((Cass:0.0624743725,(Dpon:0.0847100902,Sory:0.1212750817)52:0.0223211948)44:0.0168742678,Smad:0.0311081778)64:0.0315535327,Tcas:0.2833860188);こちらのファイルを指定してASTRAL.shを投げた。
0730
PAML前準備
~/tools/for_pamlディレクトリに/6sp/data/SCOディレクトリを作成した。
Orthofinder出力のOrthogroups.txtからシングルコピーオーソログのみを抽出する。実行スクリプトは以下の通り。
### ExOG.pyの中身
# ファイルパスの設定
orthogroups_file_path = '/home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/Orthogroups/Orthogroups.txt'
single_copy_orthologues_file_path = '/home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/Orthogroups/Orthogroups_SingleCopyOrthologues.txt'
output_file_path = '/home/kosukesano/tools/for_paml/6sp/data/extracted_orthogroups.txt'
# シングルコピーオルソログのIDをセットに格納
single_copy_orthologues = set()
with open(single_copy_orthologues_file_path, 'r') as single_copy_file:
for line in single_copy_file:
single_copy_orthologues.add(line.strip())
# Orthogroups.txt から該当する行を抽出して新しいファイルに保存
with open(orthogroups_file_path, 'r') as orthogroups_file, open(output_file_path, 'w') as output_file:
for line in orthogroups_file:
# 行の最初の部分を取り出してIDをチェック
og_id = line.split(':')[0].strip()
if og_id in single_copy_orthologues:
output_file.write(line)これを作業ノードで実行した。
続いて上記スクリプトの出力であるextracted_orthogroups.txtを参照に、各オーソログのprotein ID に対応するCDSをそれぞれのCDSファイルから取り出し、個別のファイルとして格納する。実行スクリプトは以下の通り。
### makefna.py
import os
# 入力ファイルと出力ディレクトリのパスを設定
extracted_orthogroups_path = '/home/kosukesano/tools/for_paml/6sp/data/extracted_orthogroups.txt'
cds_dir = '/home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir'
output_dir = '/home/kosukesano/tools/for_paml/6sp/data/SCO'
# ファイルの拡張子を変数に
file_extension = ".fna"
# 種ごとのファイル名マッピング
species_to_file = {
"Cass": "Cass.fna",
"Tcas": "Tcas.fna",
"Dpon": "Dpon.fna",
"Sory": "Sory.fna",
"Agra": "Agra.fna",
"Smad": "Smad.fna"
}
# 必要な出力ディレクトリを作成
os.makedirs(output_dir, exist_ok=True)
# `extracted_orthogroups.txt`を読み込み、各オーソログに対して処理
with open(extracted_orthogroups_path, 'r') as infile:
for line in infile:
columns = line.strip().split()
orthogroup_id = columns[0].replace(':', '')
protein_ids = columns[1:]
output_file_path = os.path.join(output_dir, f"{orthogroup_id}.fna")
with open(output_file_path, 'w') as outfile:
for i, protein_id in enumerate(protein_ids):
species = list(species_to_file.keys())[i]
cds_file_path = os.path.join(cds_dir, species_to_file[species])
# `seqkit grep`コマンドを構築して実行
grep_command = f"singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit grep -r -p '{protein_id}' {cds_file_path} >> {output_file_path}"
os.system(grep_command)これを作業ノードで実行した。2時間くらいかかった。
結果として、~/tools/for_paml/6sp/data/SCOにSCOのCDS配列ファイルが出力された。
kosukesano@at137:~/tools/for_paml/6sp/data/SCO$ ls
OG0008033.fna OG0008221.fna OG0008418.fna OG0008607.fna OG0008790.fna OG0008975.fna OG0009160.fna OG0009346.fna OG0009521.fna OG0009707.fna OG0009888.fna
OG0008034.fna OG0008224.fna OG0008419.fna OG0008609.fna OG0008791.fna OG0008976.fna OG0009161.fna OG0009347.fna OG0009523.fna OG0009708.fna OG0009889.fna
OG0008035.fna OG0008225.fna OG0008420.fna OG0008610.fna OG0008792.fna OG0008977.fna OG0009162.fna OG0009349.fna OG0009525.fna OG0009709.fna OG0009890.fna
OG0008036.fna OG0008226.fna OG0008421.fna OG0008611.fna OG0008794.fna OG0008978.fna OG0009163.fna OG0009350.fna OG0009526.fna OG0009710.fna OG0009892.fna
OG0008037.fna OG0008227.fna OG0008423.fna OG0008612.fna OG0008795.fna OG0008979.fna OG0009164.fna OG0009351.fna OG0009527.fna OG0009712.fna OG0009894.fna
OG0008039.fna OG0008228.fna OG0008425.fna
.
.
.
.
.
.### OG0008965.fnaの中身
>lcl|OU892281.1_cds_CAG9769486.1_7692 [locus_tag=CEUTPL_LOCUS9995] [protein_id=CAG9769486.1] [location=complement(join(35353935..35354250,35354441..35354709))] [gbke
y=CDS]
ATGGCTTATTTTCATAAACTAGGTCACCGATTTTTCACAACAAAAGCAATCCAAAACTGG
AATTCCAAAAGTGAGAAATTCAATGAGAAAATAAAGGGAACCATAGTTGAAAAATGGGTA
AAATATTGGAAACTTGTTACCAAAGATTACAAGGAAGTAGGTTTATCTGTTAAACAAGAA
ATTAAAGACAAACCTCTAAAAACTATTGTGTATTTTACTGGAGCAGCTTTATTCGGTTTG
TGTTGGGAGTTAAACCCAAACTTGCAAAGTTTCAGGGCAACATATATAGCATCAGCCAAC
GATCTAAGTTTAGTACCTCTTACTCTAGCGAACCCAAATTCAGTAGAACATTTAAAACAC
ATTGAACAATGTTTCAATCGAAAATATATAAGATATACAAACCTTGGACTTTTATCATTA
ATATGGGTAGATAAATTTAGTGAAGAATGCGATTTATATGAGAGCAACTGCTCATACCTT
AAAGTTCCTTTTTATAAAATAACAGGAAGAATTTTAGATGTTGGCTTTCTAAATGTATGG
TGGATTATTTCTAGAAGAATGCTCGATTATGATATAAATTATTAG
>lcl|NC_007417.3_cds_XP_008201558.1_3086 [db_xref=GeneID:103315214] [protein=uncharacterized protein C19orf52] [protein_id=XP_008201558.1] [location=complement(11429290..11429874)] [gbkey=CDS]
ATGTTGCGATTATCGGGGTTTAATGTTTTTGCTCTTGGAGCAAAAACTATCGAAAATTAT
AAAAAAGCAAGCGAGAGGATCAATAAGAAGATTAGCGGAACTTTCGTTGAAAAAGCAGTC
ATTTATTTGAAAACTGTATGGAATGACTATACCGAAGTAGCTGTCTCTGTTAGGAGCGAC
ATTACGGAGAAACCCCTAAAAGCGGCGGGTTTCTTCACCGGTATGGGCTTCGTAATGTAC
AGTTTAACACACAATCCGGACGAACAAAGTTTCAAAGCGAAATTTATCCAGTGTTCAAAT
GAGGTTTCTTTAGTTAGCCCAAATCTTGTCAATACCGCTGCAGTTGAACACATGAAGATG
ATACAAACTTGTTACAACAGAGACTTAATAAGGTACACAAACTTGGGGCTGTTTTCACTC
GTTTGGGTCGATAAATACAGTGATCAGTGCAACATGTACGAAACAAACTGTTCTTATTTG
CAACTGCCATACAGGAAATTCCCTAGTCACGTCATAGATGTAGGTTTTTTGAATATTTGG
TGGGTCATATCGCGCAAAATGTTAGATTATGACATAAATTATTAA
>lcl|NW_026017110.1_cds_XP_019769948.1_2611 [gene=LOC109544293] [db_xref=GeneID:109544293] [protein=mitochondrial import inner membrane translocase subunit Tim29] [protein_id=XP_019769948.1] [location=complement(join(25238952..25239267,25239322..25239623))] [gbkey=CDS]
ATGTATTTGAAAAATATTGTAAATCAGTCGAAAATGATGAAAATAAATGAACCCCAGTCC
AGATTTTTCACCACGCGTGCGTTGGAAAACTGGAAAACCACCAGTGAAAAGTTTAACGAA
AAAATTAAAGGGACGATTCTAGAAAAATGGGTGAAATATTGGAAAGTTGTGGCCAAAGAC
TACCAAGACGTCGCACTTAATGTGAAACAGGAAATTAAGCAAAAACCTTTAAAATCAACT
GTGTTTTTCACCGGGTCTGCTTTTTTAGGGCTATGCCTGCATCTAAATCCTGATCTAAAA
AGTTTTAGGTCGAAGTACATCGAATCAGCCAACAATTTAAGTTTAGTGCCACTGACGCTG
GCAAATCCAAGATCCGTAGAACATTTAAAGCACATCGAAAGATGTTTCAATCGTAAATTC
ATTCGCTATCAAAACCTGGGATTATTTTCAATTATGTGGGTAGACAAACGTAGTAAGGAG
TGCGATTCATATGAAAGCAACTGTTCATATTTAAAGGTTCCATTTTGGAATGTTAGCAGC
CGAATTTTAGACGTAGGCTTTTTGAATGTATGGTGGATTATTTCAAGGCAGATGCTAGAT
TATGATATTAATTATTAA
>lcl|NW_022146411.1_cds_XP_030754725.1_11873 [gene=LOC115881405] [db_xref=GeneID:115881405] [protein=mitochondrial import inner membrane translocase subunit Tim29] [protein_id=XP_030754725.1] [location=complement(join(1076167..1076482,1076551..1076813))] [gbkey=CDS]
ATGGTCCAATATAATCAGTCACTGTCACTAATATCTCGAACACTAGTCAATATAAAATCC
ACAAATCTTAAGTTTAATGAGAAAATAAAAGGTACCATAGTAGAAAAATGGGTGGCTTAT
TGGAAACTAGTGGCAAAAGATTATAAAGATGTTGGAAGATCATTAAAACAAGATATAAAA
ACAAAACCATTGCGATCTGGTTTATATTTTACAGGTGCAAGTTTGCTAGGACTTTGTGCA
TCTTTAAACCCCGATATGCAAAGTTTTAGAGCAAAATATATTCAATCTGCAAATGATTTA
GGGTTAGTTCCTACTACACTAGCTAACCCTCAAGCCTTAAATCATTTAAAATATATTGAG
AGAAGCTTTAATCACAACCTTATTCGTTACATAAATTTAGGTGTTTTATCGATAATCTGG
GTGGACAAATTTAGTGAAGATTGTAATTTATATGAAAATACTTGTTCTTATCTTCAAGTC
CCATTTTGGGAAATTAGAAAGAGAATGCTTGATATAGGATTTTTAAATGTATGGTGGATA
ACATCCAGAAAAATGCTTGACTATGATATAAATTATTAA
>lcl|NC_065546.1_cds_XP_050302974.1_1493 [gene=LOC126740812] [db_xref=GeneID:126740812] [protein=mitochondrial import inner membrane translocase subunit Tim29] [protein_id=XP_050302974.1] [location=join(50475192..50475457,50483767..50484082)] [gbkey=CDS]
ATGTTGAAGGTATGCAAGAGATTTTACAGTAGTCCAGTAACAGGGAATTCGAACTGGCAG
ACTGTGAGCCATAAGTTTAATCAAAAAATCAAAGGCACTTTCTTGGAAAAATGGGCGAAG
TTTTGGAAAACTGTCGCCAAAGACTATAAAGAAGTTGCCATAAATGTGAAACAGGATATA
AAACAGAAACCATTAAAGGCTGCCGCATACTTCAGTGCATCTGCCTTTGTTGGATTGTGC
ATTCAATTCAACCCAGATTTGCAAAGTTTCAGATCAAAATATGTCCAATCAGCAAATGAA
GTAGGTTTGGTACCTCTTAGCCTAACAAATCCACAAGCTGTAGAGCATTTAAATTACATT
GAAAGGTGTTTTAACCAACAGTTAATTAGGTATGTCAACCTAGGAATATTTTCAATAATA
TGGGTGGATAAATTCAGTAAAGAGTGTGACACCTATGAAAGTAAATGCACATACTTGCAA
GTTCCTTACTGGGGTATACCCAGCAGAATATTAGATATAGGATTTTTAAATGTATGGTGG
ATTACATCTAGAAAAATGTTGGATTATGACATAAATTATTAG
>g3079.t1
ATGTATTCACTGAACAAAATAAGTAGGAGGTTACTCACCACCCGAGCACTGGAAAACTTA
AAATCCACAAATGAAAAATTGAACAATAAAATAAAGGGAACATTCATTGAAAAATGGGTA
AAATATTGGAAACTTATAGCTAAAGACTATCAGGATGTCAGCATTTCAGTTAAACAAGAT
ATTAAAGCAAAACCACTGAGGACAATGGCATATTTTACAGGAGCTGCATTCATAGGTTTA
TGCATTGAATTAAATCCAGATCTGCAAAGTTTTAGGGCAAAATACATTGCATCTGCCAAT
GACCTCAGTTTAGTACCTTTACATTTAGCAAATTCACAAGCTGTTGAGCACTTGAAGTAC
GTGGAACGCTGCTTTAATCGCAAATTTATCAGATATATGAATCTTGGAATTGCATCAGTA
GTATGGGTGGATAAATATAGTAGCGAGTGTGACACCTACGAGAGCAACTGTTCTTATTTA
CAAGTACCTTATTGGAATATAACAGACAGAATATTGGACATAGGCTTCCTAAATGTATGG
TGGATTATTTCCAGAAAAATGTTAGATTATGatataaattactagこんな感じ!
続いてこれらのファイルをMAFFTによりアライメントする。実行したシェルスクリプトは以下の通り。
### mafft.sh
#$ -S /bin/bash
source ~/tools/pyenv_env/ManualPhilo_profile
# ディレクトリパス
input_dir="/home/kosukesano/tools/for_paml/6sp/data/SCO/"
output_dir="/home/kosukesano/tools/for_paml/6sp/data/SCO/"
# 各ファイルに対してアラインメントを実行
for file in "$input_dir"*.fna; do
# 元のファイル名から拡張子を除いたものを取得
base_name=$(basename "$file" .fna)
# 出力ファイル名を生成
output_file="${output_dir}${base_name}_maffted.fna"
# MAFFTを実行
mafft --auto --maxiterate 1000 --localpair "$file" > "$output_file"
echo "Aligned file created: $output_file"
doneこれをqsubで投げた。
PAMLのテストラン
~/tools/for_paml/testにて以下のコードを書き、PAMLを実行してみた。
### bsA.ctl
seqfile = /home/kosukesano/tools/for_paml/6sp/data/SCO/OG0008965_maffted.fna
treefile = /home/kosukesano/tools/for_paml/test/data/tree_ultrametric.nwk
outfile = result/OG0008965_branch_alt
noisy = 9 * 0,1,2,3,9: how much rubbish on the screen
verbose = 1 * 1: detailed output, 0: concise output
runmode = 0 * 0: user tree; 1: semi-automatic; 2: automatic
* 3: StepwiseAddition; (4,5):PerturbationNNI
seqtype = 2 * 1:codons; 2:AAs; 3:codons-->AAs
CodonFreq = 2 * 0:1/61 each, 1:F1X4, 2:F3X4, 3:codon table
clock = 0 * 0: no clock, unrooted tree, 1: clock, rooted tree
model = 2 * 記号の有無で異なる ω を推定
NSsites = 0 * サイト間では ω は一定
fix_omega = 0 * ω の値を配列から推定
omega = 1 * 推定は ω=1 からスタート
icode = 0 * 0:standard genetic code; 1:mammalian mt; 2-10:see below
fix_kappa = 0 * 1: kappa fixed, 0: kappa to be estimated
kappa = 2 * initial or fixed kappa
fix_alpha = 1 * 0: estimate gamma shape parameter; 1: fix it at alpha
alpha = .0 * initial or fixed alpha, 0:infinity (constant rate)
Malpha = 0 * different alphas for genes
ncatG = 4 * # of categories in the dG or AdG models of rates
getSE = 0 * 0: don't want them, 1: want S.E.s of estimates
RateAncestor = 0 * (1/0): rates (alpha>0) or ancestral states (alpha=0)
method = 0 * 0: simultaneous; 1: one branch at a time
fix_blength = 0 * 0: ignore, -1: random, 1: initial, 2: fixed, 3: proportional
* Specifications for duplicating results for the small data set in table 1
* of Yang (1998 MBE 15:568-573).
* see the tree file lysozyme.trees for specification of node (branch) labelsこれを以下のように実行。
kosukesano@at137:~/tools/for_paml/test$ singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml bsA.ctl
15 verbose | verbose 1.00
7 runmode | runmode 0.00
4 seqtype | seqtype 2.00
13 CodonFreq | CodonFreq 2.00
9 clock | clock 0.00
16 model | model 2.00
20 NSsites | NSsites 0.00
26 fix_omega | fix_omega 0.00
27 omega | omega 1.00
22 icode | icode 0.00
24 fix_kappa | fix_kappa 0.00
25 kappa | kappa 2.00
28 fix_alpha | fix_alpha 1.00
29 alpha | alpha 0.00
30 Malpha | Malpha 0.00
31 ncatG | ncatG 4.00
11 getSE | getSE 0.00
12 RateAncestor | RateAncestor 0.00
8 method | method 0.00
37 fix_blength | fix_blength 0.00
AAML in paml version 4.9, March 2015
processing fasta file
reading seq# 1 lcl|OU892281.1_cds_CAG9769486.1_7692 [locus_tag=CEUTPL_LOCUS9995] [protein_id=CAG9769486.1] [location=complement(join(35353935..35354250,35354441..35354709))] [gbkey=CDS] 624 sites
reading seq# 2 lcl|NC_007417.3_cds_XP_008201558.1_3086 [db_xref=GeneID:103315214] [protein=uncharacterized protein C19orf52] [protein_id=XP_008201558.1] [location=complement(11429290..11429874)] [gbkey=CDS] 624 sites
reading seq# 3 lcl|NW_026017110.1_cds_XP_019769948.1_2611 [gene=LOC109544293] [db_xref=GeneID:109544293] [protein=mitochondrial import inner membrane translocase subunit Tim29] [protein_id=XP_019769948.1] [location=complement(join(25238952..25239267,25239322..25239623) 624 sites
reading seq# 4 lcl|NW_022146411.1_cds_XP_030754725.1_11873 [gene=LOC115881405] [db_xref=GeneID:115881405] [protein=mitochondrial import inner membrane translocase subunit Tim29] [protein_id=XP_030754725.1] [location=complement(join(1076167..1076482,1076551..1076813))] 624 sites
reading seq# 5 lcl|NC_065546.1_cds_XP_050302974.1_1493 [gene=LOC126740812] [db_xref=GeneID:126740812] [protein=mitochondrial import inner membrane translocase subunit Tim29] [protein_id=XP_050302974.1] [location=join(50475192..50475457,50483767..50484082)] [gbkey=CDS] 624 sites
reading seq# 6 g3079.t1 624 sites
ns = 6 ls = 624
Reading sequences, sequential format..
Reading seq # 1: lcl|OU892281.1_cds_CAG9769486.1_7692 [locus_tag=CE
Error in sequence data file: U at 1 seq 1.
Make sure to separate the sequence from its name by 2 or more spaces.
kosukesano@at137:~/tools/for_paml/test$ ls result/
OG0008101_branch_alt OG0008101_bs_alt OG0008768_branch_alt OG0008965_branch_altシーケンスデータファイルの形式に問題がある?
2024年8月
0804
CDSを用いたOrthofinderの結果
ファイルの拡張子が合わない?
標準出力ファイルは以下のようになった。
### Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.o26291666の中身
start at
Sun Aug 4 00:59:20 JST 2024
OrthoFinder version 2.5.4 Copyright (C) 2014 David Emms
2024-08-04 00:59:25 : Starting OrthoFinder 2.5.4
5 thread(s) for highly parallel tasks (BLAST searches etc.)
5 thread(s) for OrthoFinder algorithm
Checking required programs are installed
----------------------------------------
Test can run "mcl -h" - ok
Test can run "fastme -i /home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir/OrthoFinder/Results_Aug04/WorkingDirectory/SimpleTest.phy -o /home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir/OrthoFinder/Results_Aug04/WorkingDirectory/SimpleTest.tre" - ok
WARNING: Files have been ignored as they don't appear to be FASTA files:
Agra.fna
Cass.fna
Dpon.fna
Smad.fna
Sory.fna
Tcas.fna
OrthoFinder expects FASTA files to have one of the following extensions: pep, faa, fa, fas, fasta
ERROR: At least two species are required
ERROR: An error occurred, ***please review the error messages*** they may contain useful information about the problem.
Sun Aug 4 00:59:47 JST 2024
(END)入力ファイルを.fastaに変更。もう一度qsubで投げた。
CAFEの結果とDEG解析の結果を照合する
以下のコードをrで実行した。
### home/bio/for_cafe/caferesult.Rの中身
library(tidyverse)
Deg<-read.csv("Deg/DEG_ovary_vs_body_DESeq2.csv", sep=",")
og<-read.csv("Original_data/OrthoFinder/Results_Jun25/Orthogroups/Orthogroups.tsv", sep=":", skip=1)
Plami<-read.csv("old_result/Base_change.tab", sep="\t")
View(Plami)
# ファイルを読み込む
file_path <- "old_result/Base_asr.tre"
lines <- readLines(file_path)
print(lines)
# TREESセクションのみを抽出する
trees_start <- which(grepl("BEGIN TREES;", lines))
trees_end <- which(grepl("END;", lines))
trees_lines <- lines[(trees_start + 1):(trees_end - 1)]
# 不要なスペースを削除
trees_lines <- gsub("^\\s+|\\s+$", "", trees_lines)
# データフレームに変換
library(tibble)
trees_df <- tibble(Tree = trees_lines)
ex=trees_df|>###マダラで優位に増減したOGのOG番号を抽出したファイル
#lines|>
tidyr::separate(Tree, into = c("OG_num", "tree"), sep = r"(\s=\s)")|>
dplyr::mutate(OG_num = stringr::str_extract(OG_num, "OG\\d+")) |>
dplyr::mutate(tree = stringr::str_extract(tree, r"(Smad<0>_)")) |>
dplyr::mutate(tree = tidyr::replace_na(tree, "significant")) |>
dplyr::filter(tree == "significant") |>
print()
View(ex)
#################################################################
Plami2=Plami |>###マダラで増加した0Gの0G番号を抽出したファイル
dplyr::select("FamilyID","Smad.0.") |>
dplyr::mutate(Smad.0. = stringr::str_extract(Smad.0., r"(^\d+)")) |>
tidyr::drop_na()|>
dplyr::filter(Smad.0. != 0) |>
print()
View(Plami2)
#################################################################
df=dplyr::inner_join(Plami2, ex, by = c(FamilyID = "OG_num"))|>###マダラで優位に増加したOGのOG番号を抽出したファイル
print()
##################################################################
# ファイルパスの指定
orthogroups_file <- "Original_data/OrthoFinder/Results_Jun25/Orthogroups/Orthogroups.tsv"
# Orthogroups.tsvの読み込み
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
read.delim(orthogroups_file, header=FALSE, sep="\t",
#stringsAsFactors=FALSE,
#col.names = "Data"
skip=1
)|>
dplyr::select("V1", "V5")
# データの最初の数行を表示して確認
head(orthogroups)
View(orthogroups)
################################################################
df2=dplyr::left_join(df, orthogroups, by = c(FamilyID = "V1"))|>
dplyr::select(!c(Smad.0., tree)) |>
print()
View(df2)
################################################################
# V5列の遺伝子IDをカンマで区切って、新しいデータフレームを作成
df_expanded <- df2 %>%###マダラでのみ増加した遺伝子のgene_IDとOG番号
separate_rows(V5, sep = ", ") %>%
rename(gene_ID = V5, family_ID = FamilyID)|>
print()
###############################################################
### CAFE5でマダラでのみ増加した遺伝子とその機能のファイル、df3
fa<-read.csv("/Users/kosukesano/bio/functional_annotation/merged_with_gene_function.csv", sep=",")
View(fa)
df3=dplyr::left_join(df_expanded, fa, by = c(gene_ID = "Madara"))|>###完成系
print()
write.csv(df3, "CAFE_plus_gene.csv", row.names = FALSE)
##############################################################
###DEG解析との結合
deg1=read.csv("/Users/kosukesano/bio/for_cafe/Deg/DEG_ovary_vs_body_DESeq2.csv", sep=",")|>
print()
View(deg1)
deg1_merge=dplyr::inner_join(df3, deg1, by = "gene_ID")|>
dplyr::select(!c(family_ID)) |>
print()
View(deg1_merge)
write.csv(deg1_merge, "DEG_CAFE_ovary_vs_body.csv", row.names = FALSE)
deg2=read.csv("/Users/kosukesano/bio/for_cafe/Deg/DEG_Adult_vs_Larva_DESeq2.csv", sep=",")|>
print()
View(deg2)
deg2_merge=dplyr::inner_join(df3, deg2, by = "gene_ID")|>
dplyr::select(!c(family_ID)) |>
print()
View(deg2_merge)
write.csv(deg2_merge, "DEG_CAFE_adult_vs_larva.csv", row.names = FALSE)
#################################################################
print(c(deg1_merge$Dmelanogaster, deg1_merge$Dmel_GeneFunction))
print(deg2_merge$Dmel_GeneFunction)
sng=df3|>
dplyr::mutate(Dmelanogaster = stringr::str_replace(Dmelanogaster, "Dmel_", "")) |>
dplyr::select(Dmelanogaster) |>
print(n = 488)0805
PAMLのテスト続き
前回の反省をもとにヘッダー最後に空白を入れた上でPAML実行。以下はヘッダーに空白を加えるスクリプト。
###
import os
import re
# 処理するディレクトリ
directory = '/home/kosukesano/tools/for_paml/6sp/data/SCO'
# 正規表現パターン
pattern = re.compile(r'^>.*')
# ディレクトリ内のファイルを処理
for filename in os.listdir(directory):
if re.match(r'^OG\d+_maffted\.fna$', filename):
filepath = os.path.join(directory, filename)
with open(filepath, 'r') as file:
lines = file.readlines()
new_lines = []
for line in lines:
if pattern.match(line):
# シーケンス名行には2つ以上のスペースを追加
new_lines.append(re.sub(r'(>.*)', r'\1 ', line.strip()))
else:
# シーケンス行には変更を加えない
new_lines.append(line.strip())
# 整形されたファイルを保存
with open(filepath, 'w') as file:
for line in new_lines:
file.write(line + '\n')
print("All files have been processed.")これを実行した上で、PAMLを再実行。
kosukesano@at137:~/tools/for_paml/test$ python seq_space_plus.py
All files have been processed.
kosukesano@at137:~/tools/for_paml/test$ singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml bsA.ctl
15 verbose | verbose 1.00
7 runmode | runmode 0.00
4 seqtype | seqtype 2.00
13 CodonFreq | CodonFreq 2.00
9 clock | clock 0.00
16 model | model 2.00
20 NSsites | NSsites 0.00
26 fix_omega | fix_omega 0.00
27 omega | omega 1.00
22 icode | icode 0.00
24 fix_kappa | fix_kappa 0.00
25 kappa | kappa 2.00
28 fix_alpha | fix_alpha 1.00
29 alpha | alpha 0.00
30 Malpha | Malpha 0.00
31 ncatG | ncatG 4.00
11 getSE | getSE 0.00
12 RateAncestor | RateAncestor 0.00
8 method | method 0.00
37 fix_blength | fix_blength 0.00
AAML in paml version 4.9, March 2015
processing fasta file
reading seq# 1 lcl|OU892281.1_cds_CAG9769486.1_7692 [locus_tag=CEUTPL_LOCUS9995] [protein_id=CAG9769486.1] [location=complement(join(35353935..35354250,35354441..35354709))] [gbkey=CDS] 624 sites
reading seq# 2 lcl|NC_007417.3_cds_XP_008201558.1_3086 [db_xref=GeneID:103315214] [protein=uncharacterized protein C19orf52] [protein_id=XP_008201558.1] [location=complement(11429290..11429874)] [gbkey=CDS] 624 sites
reading seq# 3 lcl|NW_026017110.1_cds_XP_019769948.1_2611 [gene=LOC109544293] [db_xref=GeneID:109544293] [protein=mitochondrial import inner membrane translocase subunit Tim29] [protein_id=XP_019769948.1] [location=complement(join(25238952..25239267,25239322..25239623) 624 sites
reading seq# 4 lcl|NW_022146411.1_cds_XP_030754725.1_11873 [gene=LOC115881405] [db_xref=GeneID:115881405] [protein=mitochondrial import inner membrane translocase subunit Tim29] [protein_id=XP_030754725.1] [location=complement(join(1076167..1076482,1076551..1076813))] 624 sites
reading seq# 5 lcl|NC_065546.1_cds_XP_050302974.1_1493 [gene=LOC126740812] [db_xref=GeneID:126740812] [protein=mitochondrial import inner membrane translocase subunit Tim29] [protein_id=XP_050302974.1] [location=join(50475192..50475457,50483767..50484082)] [gbkey=CDS] 624 sites
reading seq# 6 g3079.t1 624 sites
ns = 6 ls = 624
Reading sequences, sequential format..
^C
kosukesano@at137:~/tools/for_paml/test$途中で止まって動かない……。
種名がないのが原因なのでは? CDSを取ってくる際にヘッダーに元ファイルの種名をくっつけるように変更。
###/home/kosukesano/tools/for_paml/6sp/data/makefna_plusname.py の中身
import os
# 入力ファイルと出力ディレクトリのパスを設定
extracted_orthogroups_path = '/home/kosukesano/tools/for_paml/6sp/data/extracted_orthogroups.txt'
cds_dir = '/home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir'
output_dir = '/home/kosukesano/tools/for_paml/6sp/data/SCO_plusname'
# ファイルの拡張子を変数に
file_extension = ".fasta"
# 種ごとのファイル名マッピング
species_to_file = {
"Cass": "Cass.fasta",
"Tcas": "Tcas.fasta",
"Dpon": "Dpon.fasta",
"Sory": "Sory.fasta",
"Agra": "Agra.fasta",
"Smad": "Smad.fasta"
}
# 必要な出力ディレクトリを作成
os.makedirs(output_dir, exist_ok=True)
# `extracted_orthogroups.txt`を読み込み、各オーソログに対して処理
with open(extracted_orthogroups_path, 'r') as infile:
for line in infile:
columns = line.strip().split()
orthogroup_id = columns[0].replace(':', '')
protein_ids = columns[1:]
output_file_path = os.path.join(output_dir, f"{orthogroup_id}{file_extension}")
with open(output_file_path, 'w') as outfile:
for i, protein_id in enumerate(protein_ids):
species = list(species_to_file.keys())[i]
cds_file_path = os.path.join(cds_dir, species_to_file[species])
# `seqkit grep`コマンドを構築して実行
grep_command = f"singularity exec -e /usr/local/biotools/s/seqkit:2.5.0--h9ee0642_0 seqkit grep -r -p '{protein_id}' {cds_file_path}"
result = os.popen(grep_command).read()
# ヘッダーに種名を追加
result = result.replace('>', f'>{species}|', 1)
# 出力ファイルに書き込み
outfile.write(result)
print("All files have been processed.")試しにこれで出力されたOG0008033.fastaをMAFFTでアライメントしてテストプレイ。MAFFTのスクリプトは以下の通り。
###/home/kosukesano/tools/for_paml/6sp/data/mafft_240805test.shの中身
#$ -S /bin/bash
source ~/tools/pyenv_env/ManualPhilo_profile
# ディレクトリパス
input_dir="/home/kosukesano/tools/for_paml/6sp/data/SCO_plusname/"
output_dir="/home/kosukesano/tools/for_paml/6sp/data/SCO/"
# 各ファイルに対してアラインメントを実行
for file in "$input_dir"OG0008033.fasta; do
# 元のファイル名から拡張子を除いたものを取得
base_name=$(basename "$file" .fasta)
# 出力ファイル名を生成
output_file="${output_dir}${base_name}_maffted.fasta"
# MAFFTを実行
mafft --auto --maxiterate 1000 --localpair "$file" > "$output_file"
echo "Aligned file created: $output_file"
doneこれでできたファイルは以下の通り。
>Cass|lcl|OU892281.1_cds_CAG9769486.1_7692 [locus_tag=CEUTPL_LOCUS9995] [protein_id=CAG9769486.1] [location=complement(join(35353935..35354250,35354441..35354709))] [gbke
y=CDS]
---------------------------------atggcttattttcataaactaggtcac
cgatttttca------caacaaaagcaatccaaaactggaattccaaaagtgagaaattc
aatgagaaaataaagggaaccatagttgaaaaatgggtaaaatattggaaacttgttacc
aaagattacaaggaagtaggtttatctgttaaacaagaaattaaagacaaacctctaaaa
actattgtgtattttactggagcagctttattcggtttgtgttgggagttaaacccaaac
ttgcaaagtttcagggcaacatatatagcatcagccaacgatctaagtttagtacctctt
actctagcgaacccaaattcagtagaacatttaaaacacattgaacaatgtttcaatcga
aaatatataagatatacaaaccttggacttttatcattaatatgggtagataaatttagt
gaagaatgcgatttatatgagagcaactgctcataccttaaagttcctttttataaaata
acaggaagaattttagatgttggctttctaaatgtatggtggattatttctagaagaatg
ctcgattatgatataaattattag
.
.
.これでPAMLを実行。実行の様子は以下の通り
kosukesano@at138:~/tools/for_paml/test$ singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml 240805_bsA.ctl
15 verbose | verbose 1.00
7 runmode | runmode 0.00
4 seqtype | seqtype 2.00
13 CodonFreq | CodonFreq 2.00
9 clock | clock 0.00
16 model | model 2.00
20 NSsites | NSsites 0.00
26 fix_omega | fix_omega 0.00
27 omega | omega 1.00
22 icode | icode 0.00
24 fix_kappa | fix_kappa 0.00
25 kappa | kappa 2.00
28 fix_alpha | fix_alpha 1.00
29 alpha | alpha 0.00
30 Malpha | Malpha 0.00
31 ncatG | ncatG 4.00
11 getSE | getSE 0.00
12 RateAncestor | RateAncestor 0.00
8 method | method 0.00
37 fix_blength | fix_blength 0.00
AAML in paml version 4.9, March 2015
processing fasta file
reading seq# 1 Cass|lcl|OU892280.1_cds_CAG9767834.1_5986 [locus_tag=CEUTPL_LOCUS8389] [protein_id=CAG9767834.1] [location=complement(20400831..20402546)] [gbkey=CDS] 1734 sites
reading seq# 2 Tcas|lcl|NC_007418.3_cds_XP_001812254.1_4585 [db_xref=GeneID:656905] [protein=SRSF protein kinase 1] [protein_id=XP_001812254.1] [location=complement(join(4772053..4772586,4775214..4776362))] [gbkey=CDS] 1734 sites
reading seq# 3 Dpon|lcl|NW_026018611.1_cds_XP_019769583.1_13918 [gene=LOC109544031] [db_xref=GeneID:109544031] [protein=SRSF protein kinase 2] [protein_id=XP_019769583.1] [location=complement(5272139..5273848)] [gbkey=CDS] 1734 sites
reading seq# 4 Sory|lcl|NW_022146996.1_cds_XP_030760502.1_17223 [gene=LOC115885665] [db_xref=GeneID:115885665] [protein=SRSF protein kinase 1] [protein_id=XP_030760502.1] [location=complement(5794500..5796164)] [gbkey=CDS] 1734 sites
reading seq# 5 Agra|lcl|NC_065547.1_cds_XP_050292688.1_2936 [gene=LOC126750744] [db_xref=GeneID:126750744] [protein=SRSF protein kinase 2] [protein_id=XP_050292688.1] [location=complement(join(29075850..29076844,29099051..29099762))] [gbkey=CDS] 1734 sites
reading seq# 6 Smad|g5339.t1 1734 sites
ns = 6 ls = 1734
Reading sequences, sequential format..
Reading seq # 1: Cass|lcl|OU892280.1_cds_CAG9767834.1_5986 [locus_t
Error in sequence data file: U at 5 seq 1.
Make sure to separate the sequence from its name by 2 or more spaces.
kosukesano@at138:~/tools/for_paml/test$ 空白入れたはずなのにエラー?
まっちゃん先輩の残したデータではヘッダー行に種名以外何もなかったな…。手動で種名以下を削除し、タブを追加してやってみる。
>Cass
atgagctcgaaagtggacgtaaatcgacgtattctggctatccaggctaagaagaaacgc
cataagcccaacaagaagaaaggcaagaacgataatatgaatggacatggggagaatcgg
atccgt---tcgaaaaacgagccttcccacagttccagcaatgagactatcgaggacccg
gatacaccgtatacaagtgatgaagaagaacaagaggacagcaccgattatcagaaggga
ggataccaccccgtcaagattggcgacctctttcttggaaggtatcatgtcactagaaaa
ttaggttggggtcatttttccactgtttggctttgctgggatctcgaagacaaacgattt
gtagctttaaaaattgtaaaatcagctaaacatttcactgaaactgctttggatgaaatc
aaaatcctcagatcagtccgcgactctgatccacaagaccccaaaaggaacaaaacagtc
caacttctgaatgatttcaaaataagtggggttaatggggtgcatgtgtgcatggtcttc
gaagttcttggtcatcatttattaaaacttataataaaatccaattaccgaggcatccca
ttggccaatgttcgtactataatgcgacaagttttagaaggtctagattatttacattca
aagtgcaaaataatccatacagacataaaaccagaaaatgtacttgtatgtgtctctgaa
gaatatattagacggcttgcttgtgaagcagccgaaatgcaccaattaggagttaaacta
ccaacttctcttataagcactgcacctccacaagaagcacctccccaaaaaatgagcaaa
aataaaaagaaaaaactcaaaaaaaaggctaagaggcaaaatgaacttctcaaaaaacaa
atggaacaaattatcgagattgaagaaaagaagaaagttagcaaagaaaatggtgatgtt
aatgatgatgttaatgatgatgatatagagtgtaataattgtacaaatgatgaagaagtc
gctaa---tgataaaattattaatggt---gtagatgagattggtggtggagaaaat---
atcccttgtgatgaacc---gtctattgctgaccctgttgtgataatgtctgaagatgac
tctccttctctaacttcaaaaagtgaaagtaaaatggaattagatccagcctttgttgaa
tgtgattttgaggtcaaaattgctgacctcggaaatgcttgttgggtcgacaaacatttc
acagaagacatccaaacaagacaatacagatctttggaagttctacttggtgctggctat
aatacttcagccgatatttggagcactgcttgcatggcctttgaattagccactggagac
tatttatttgaaccacattctggagaagattattgcagagatgaagaccatttagcccat
atcattgagttattgggaaacattccgaaaagaattgcccaaagtggaacaaattctaaa
ttatttttcaacaagaaaaatgaacttcgccatattacagggttgaaaccatggggtctt
gaagatgtgttgcaggaaaaatatgagtggccgcccaaaaatgcccgcgaatttgcaggc
ttcctgaaaccaatgttggactttgatccggacaaaagggccactgcagcagaatgtctg
aagcatccatggttgaacaataat---gaaccctcgctctctgtaggtgactga
>Tcas
atgagcgcaaaattggacgtaaatagacgtgtcttagctatccaagctaaaaagaaacga
cataagccagctaagaaaaaaggtaagaacg---aaatgaacggccacggggaaaaccgg
atcaat---tcgaagaccgagccctcgcacagctccagcaatgagacgatcgaagaccag
gacgacccgtacacgagcgaggaggaggagcaagaagacagcaatgactaccggaaaggg
ggctaccatcctgtcaaaatcggggacctgttcctcaaccgctatcacgtcacgcggaag
ctgggctggggccacttctccaccgtgtggctgtgctgggacctgcaggaccggcggttc
gtggccctgaagatcgtcaaatcggccgaacacttcaccgaaacggcgcttgacgaaatc
aaaattttaaaagcggtgcgggagtccgaccccacggaccccaaacgcaacaagactgtc
cagttgttgaacgacttcaagatcagcggaatcaacggcgtgcacgtgtgcatggtcttt
gaagtgcttggccaccacctgttaaagctaattatcaaatcgaactaccgagggatccct
ctggacaacgtccgcacaatcatgcggcaggttctggaaggtctcgactatttgcatacg
aaatgtaaaataatccacaccgatatcaagcccgaaaacgtcctgatttgtgttagtgaa
gagtatatcaggaggctggcgtgcgaggcggcggaaatgcaccatctaggcttaaaatta
cccacgtctcttataagcaccgcaccggtccaggaagtacaagcgtcgaaaatgagcaaa
aacaagaagaagaagctgaagaagaaggcgaaacgacttaatgagctacttaaacggcag
atggagcaaatcatagagattgaggagcagaagaaggt---gaaggaaaacggcgatgtg
gcgactgataacgactgcaatggaactagt---ccgagtc---ccgagacgacgcccgag
ggccccgaagacaaactctccaacggttgccttgacgaactcgccgggggcgag-これを使ってPAMLを実行。実行結果は以下の通り。
kosukesano@at138:~/tools/for_paml/test$ singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml 240805_bsA.ctl
15 verbose | verbose 1.00
7 runmode | runmode 0.00
4 seqtype | seqtype 2.00
13 CodonFreq | CodonFreq 2.00
9 clock | clock 0.00
16 model | model 2.00
20 NSsites | NSsites 0.00
26 fix_omega | fix_omega 0.00
27 omega | omega 1.00
22 icode | icode 0.00
24 fix_kappa | fix_kappa 0.00
25 kappa | kappa 2.00
28 fix_alpha | fix_alpha 1.00
29 alpha | alpha 0.00
30 Malpha | Malpha 0.00
31 ncatG | ncatG 4.00
11 getSE | getSE 0.00
12 RateAncestor | RateAncestor 0.00
8 method | method 0.00
37 fix_blength | fix_blength 0.00
AAML in paml version 4.9, March 2015
processing fasta file
reading seq# 1 Cass 1734 sites
reading seq# 2 Tcas 1734 sites
reading seq# 3 Dpon 1734 sites
reading seq# 4 Sory 1734 sites
reading seq# 5 Agra 1734 sites
reading seq# 6 Smad 1734 sites
ns = 6 ls = 1734
Reading sequences, sequential format..
Reading seq # 1: Cass
Reading seq # 2: Tcas
Reading seq # 3: Dpon
Reading seq # 4: Sory
Reading seq # 5: Agra
Reading seq # 6: Smad
Sequences read..
Counting site patterns.. 0:00
477 patterns at 1734 / 1734 sites (100.0%), 0:00
Counting frequencies..
120 bytes for distance
152640 bytes for conP
0 bytes for fhK
5000000 bytes for space
Species 97?
kosukesano@at138:~/tools/for_paml/test$ less result/OG0008033_branch_alt_240805
kosukesano@at138:~/tools/for_paml/test$ ls
240805_bsA.ctl 2NG.dN 2NG.dS 2NG.t bsA.ctl bsAtest.sh bsAtest.sh.e26312072 bsAtest.sh.o26312072 data lnf result rst rst1 rub seq_space_plus.py
kosukesano@at138:~/tools/for_paml/test$結果のファイルは以下の通り
###~/tools/for_paml/test/result/OG0008033_branch_alt_240805の中身
Homogeneity statistic: X2 = 0.07874 G = 0.07932
Average 0.32401 0.00000 0.00000 0.00000 0.20852 0.00000 0.00000 0.24080 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.00000 0.22667 0.00000 0.00000 0.00000
(Ambiguity characters are used to calculate freqs.)
# constant sites: 917 (52.88%)
AA distances (raw proportions of different sites)
Cass
Tcas 0.3224
Dpon 0.2463 0.2912
Sory 0.2520 0.2993 0.2566
Agra 0.2607 0.2895 0.2641 0.2785
Smad 0.2163 0.2780 0.2174 0.2347 0.2480できてる!
CDSを取り直したので、MAFFTをかける。また、ついでにヘッダーの処理も行う。使用したスクリプトは以下の通り。
###~/tools/for_paml/6sp/data/mafft_plusname.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
source /home/kosukesano/tools/pyenv_env/ManualPhilo_profile
# 入力ディレクトリと出力ディレクトリのパスを設定
input_dir="/home/kosukesano/tools/for_paml/6sp/data/SCO_plusname/"
output_dir="/home/kosukesano/tools/for_paml/6sp/data/SCO_plusname/"
# 各ファイルに対してアラインメントを実行
for file in "$input_dir"*.fasta; do
# 元のファイル名から拡張子を除いたものを取得
base_name=$(basename "$file" .fasta)
# 出力ファイル名を生成
output_file="${output_dir}${base_name}_maffted.fasta"
# MAFFTを実行
mafft --auto --maxiterate 1000 --localpair "$file" > "$output_file"
echo "Aligned file created: $output_file"
# ヘッダーを整形
awk '/^>/ {split($0, a, "|"); print ">" a[1] "\t"} !/^>/ {print}' "$output_file" > "${output_dir}${base_name}_maffted_fixed.fasta"
echo "Fixed headers for file: ${output_dir}${base_name}_maffted_fixed.fasta"
done
~PAMLの再帰的な実行
まずブランチサイトモデルについて行う。~/tools/for_paml/6sp/bsAディレクトリを作成、その中で以下のスクリプトを用意した。
- run_paml.sh
- template.ctl
### run_paml.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/6sp/data/SCO_plusname"
bsA_dir="/home/kosukesano/tools/for_paml/6sp/bsA"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/template.ctl"
# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"
# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")
# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fasta; do
if [[ -f "$file" ]]; then
base_name=$(basename "$file" .fasta)
outfile_path="$result_dir/${base_name}_branch_alt"
# 一時的な制御ファイルの内容を生成
ctl_content="${ctl_template//<SEQFILE>/$file}"
ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"
# 一時的な制御ファイルを作成
ctl_path="$bsA_dir/bsA.ctl"
echo "$ctl_content" > "$ctl_path"
# PAMLを実行
singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"
echo "Processed file: $file, output: $outfile_path"
fi
done### template.ctlの中身
seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/test/data/Orthofinder_tree_convert.nwk
outfile = <OUTFILE>
noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
model = 2
NSsites = 2
fix_omega = 0
omega = 1
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0また、/home/kosukesano/tools/for_paml/test/data/ディレクトリにOrthofinder_tree_convert.nwkを作成した。
### Orthofinder_tree_convert.nwkの中身
(Tcas:0.177097,(Sory:0.19234,((Dpon:0.181257,(Cass:0.179451,Smad#1:0.145856):0.0217713):0.0202496,Agra:0.176422):0.0384627):0.177097);これは元々Orthofinder出力のSpecies_Tree/SpeciesTree_rooted.txt
### SpeciesTree_rooted.txtの中身
(Tcas:0.177097,(Sory:0.19234,((Dpon:0.181257,(Cass:0.179451,Smad:0.145856)0.253106:0.0217713)0.232305:0.0202496,Agra:0.176422)0.60296:0.0384627)1:0.177097);何か変な数字(ブートストラップ値?)がついていたので、手動で除外したのがSpecies_Tree/SpeciesTree_rooted.txt。
PAMLの実行自体はrun_paml.shをqsubで投げることで行った。
続いてブランチサイトの帰無仮説の方。~/tools/for_paml/6sp/bs_nullディレクトリを作成、その中で以下のスクリプトを用意した。
- bsN_run_paml.sh
- bsN_template.ctl
###bsN_run_paml.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/6sp/data/SCO_plusname"
bsA_dir="/home/kosukesano/tools/for_paml/6sp/bs_null"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/bsN_template.ctl"
# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"
# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")
# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fasta; do
if [[ -f "$file" ]]; then
base_name=$(basename "$file" .fasta)
outfile_path="$result_dir/${base_name}_branch_alt_null"
# 一時的な制御ファイルの内容を生成
ctl_content="${ctl_template//<SEQFILE>/$file}"
ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"
# 一時的な制御ファイルを作成
ctl_path="$bsA_dir/bsA.ctl"
echo "$ctl_content" > "$ctl_path"
# PAMLを実行
singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"
echo "Processed file: $file, output: $outfile_path"
fi
done###bsN_template.ctlの中身
seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/test/data/Orthofinder_tree_convert.nwk
outfile = <OUTFILE>
noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
model = 2
NSsites = 2
fix_omega = 1
omega = 1
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0これも同様にqsubで投げた。
0806
PAML出力のまとめ、尤度比検定
昨日のジョブがうまくいって終わったので、結果を比較する。尤度比検定用のスクリプトは以下の通り。
###~/tools/for_paml/6sp/bs_lrp.pyの中身
import os
import re
from scipy.stats import chi2
def parse_lnL(file_path):
with open(file_path, 'r') as f:
for line in f:
match = re.search(r'lnL\(ntime: \d+ np: (\d+)\):\s+(-?\d+\.\d+)', line)
if match:
np = int(match.group(1))
lnL = float(match.group(2))
return np, lnL
return None, None
def perform_lrt(alt_lnL, alt_np, null_lnL, null_np):
lr_stat = 2 * (alt_lnL - null_lnL)
df = alt_np - null_np
p_val = chi2.sf(lr_stat, df)
return p_val
def main():
alt_dir = '~/tools/for_paml/6sp/bsA/result'
null_dir = '~/tools/for_paml/6sp/bs_null/result'
output_file = 'branch_site_lrt_results.txt'
alt_dir = os.path.expanduser(alt_dir)
null_dir = os.path.expanduser(null_dir)
og_files = [f for f in os.listdir(alt_dir) if '_maffted_fixed_branch_alt' in f]
with open(output_file, 'w') as out_f:
out_f.write('OG_num\tp_val\tpositive_selection\n')
for og_file in og_files:
og_num = og_file.split('_')[0]
alt_file = os.path.join(alt_dir, og_file)
null_file = os.path.join(null_dir, og_file.replace('_maffted_fixed_branch_alt', '_maffted_fixed_branch_alt_null'))
if os.path.exists(null_file):
alt_np, alt_lnL = parse_lnL(alt_file)
null_np, null_lnL = parse_lnL(null_file)
if alt_np is not None and null_np is not None:
p_val = perform_lrt(alt_lnL, alt_np, null_lnL, null_np)
reject_null = '+' if p_val < 0.05 else '-'
out_f.write(f'{og_num}\t{p_val}\t{reject_null}\n')
if __name__ == "__main__":
main()結果は~/tools/for_paml/6sp/branch_site_lrt_results.txtとして出力された。
### ~/tools/for_paml/6sp/branch_site_lrt_results.txtの中身
G_num p_val positive_selection
OG0008991 1.0 -
OG0008220 1.0 -
OG0009516 0.02359893128372939 +
OG0009076 1.0 -
OG0009448 1.0 -
OG0010062 1.0 -
OG0009276 1.0 -
OG0009923 1.0 -
OG0009794 0.27039658785634013 -
OG0008669 1.0 -
OG0009998 0.5081029121888809 -
OG0009946 0.8522214828985033 -
OG0009111 0.6973912229476 -
OG0009787 1.0 -
OG0008626 0.2573023006043861 -
OG0009987 1.0 -
OG0009641 0.5390711466832219 -
OG0009570 1.0 -
OG0008058 1.0 -
OG0009949 0.34527143821093376 -
OG0008996 0.705351995899238 -
OG0008847 1.0 -
OG0009812 1.0 -
OG0009515 0.42550751108380647 -
OG0009410 1.0 -
OG0008868 0.28057388758663304 -
.
.
.
.
.
.これをローカル環境に持っていき、遺伝子機能のファイルと照合した。
###~/bio/for_cafe/caferesult.Rの一部
###############################################################
#マダラの遺伝子番号とOGの紐付け
df4 <- orthogroups %>%###
separate_rows(V5, sep = ", ") %>%
rename(gene_ID = V5, family_ID = V1)|>
print()
View(df4)
df5=dplyr::left_join(df4, fa, by = c(gene_ID = "Madara"))|>###完成系
print()
#################################################################
paml<-read.csv("branch_site_lrt_results.txt", sep="\t")
print(paml)
paml_po=paml|>
dplyr::filter(positive_selection == "+") |>
print()
df6=dplyr::inner_join(paml_po, df5, by = c(OG_num = "family_ID"))|>
print()
View(df6)正の選択が検出された遺伝子は15個であった。
0814
PAMLの多重検定とp値の補正
多重検定における尤度比の補正を行った。遺伝研が不具合でログインできなかったため、ローカルで実行。
まずpaml用のディレクトリを作成し、その中にbranch_site_lrt_results.txtをコピー。
:~/bio$ mkdir for_paml
:~/bio$ ls
DEG_Adult_vs_Larva_DESeq2.csv SRR11742112_2.fastq SRR9665770_report1.html for_cafe madara_annotated qc_SRR9665770_2.fq
DEG_ovary_vs_body_DESeq2.csv SRR9665770 fastp.json for_eggnoc memo.txt
SRR11742112 SRR9665770_1.fastq femo_annotated for_paml new_rbh.py
SRR11742112_1.fastq SRR9665770_2.fastq for_blast_test functional_annotation qc_SRR9665770_1.fq
:~/bio$ cd for_paml/
:~/bio/for_paml$ pwd
/Users/kosukesano/bio/for_paml
:~/bio/for_paml$ cd ../for_cafe/
:~/bio/for_cafe$ ls
CAFE_plus_gene.csv ManualPhylo_2.py Rplot01.png caferesult.R for_sinkagakkai.png
DEG_CAFE_adult_vs_larva.csv ManualPhylo_3.py Rplot02.png caferesult_6sp.png ogfil.py
DEG_CAFE_ovary_vs_body.csv Original_data ThroughoutCAFE.R caferesult_6sp_iqtree.png old_result
Deg Processed_data branch_site_lrt_results.txt cleaned_orthogroups.tsv out_madara_SP.txt
ManualPhylo_1.py Rplot.png bs_positive_gene.csv for_cafe.Rproj tree_ultrametric.nwk
:~/bio/for_cafe$ cp branch_site_lrt_results.txt ../for_paml/
:~/bio/for_cafe$ cd ../for_paml/
:~/bio/for_paml$ ls
branch_site_lrt_results.txt
:~/bio/for_paml$次にPythonの仮想環境paml_hoseiを作成。
:~/bio/for_paml$ python3 -m venv paml_hosei
:~/bio/for_paml$ source paml_hosei/bin/activate
(paml_hosei) :~/bio/for_paml$ pip install pandas
Collecting pandas
Downloading pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl.metadata (19 kB)
Collecting numpy>=1.26.0 (from pandas)
Downloading numpy-2.0.1-cp312-cp312-macosx_10_9_x86_64.whl.metadata (60 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.9/60.9 kB 4.1 MB/s eta 0:00:00
Collecting python-dateutil>=2.8.2 (from pandas)
Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl.metadata (8.4 kB)
Collecting pytz>=2020.1 (from pandas)
Downloading pytz-2024.1-py2.py3-none-any.whl.metadata (22 kB)
Collecting tzdata>=2022.7 (from pandas)
Downloading tzdata-2024.1-py2.py3-none-any.whl.metadata (1.4 kB)
Collecting six>=1.5 (from python-dateutil>=2.8.2->pandas)
Downloading six-1.16.0-py2.py3-none-any.whl.metadata (1.8 kB)
Downloading pandas-2.2.2-cp312-cp312-macosx_10_9_x86_64.whl (12.5 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 12.5/12.5 MB 58.6 MB/s eta 0:00:00
Downloading numpy-2.0.1-cp312-cp312-macosx_10_9_x86_64.whl (21.0 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 21.0/21.0 MB 39.4 MB/s eta 0:00:00
Downloading python_dateutil-2.9.0.post0-py2.py3-none-any.whl (229 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 229.9/229.9 kB 18.1 MB/s eta 0:00:00
Downloading pytz-2024.1-py2.py3-none-any.whl (505 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 505.5/505.5 kB 20.4 MB/s eta 0:00:00
Downloading tzdata-2024.1-py2.py3-none-any.whl (345 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 345.4/345.4 kB 34.5 MB/s eta 0:00:00
Downloading six-1.16.0-py2.py3-none-any.whl (11 kB)
Installing collected packages: pytz, tzdata, six, numpy, python-dateutil, pandas
Successfully installed numpy-2.0.1 pandas-2.2.2 python-dateutil-2.9.0.post0 pytz-2024.1 six-1.16.0 tzdata-2024.1
[notice] A new release of pip is available: 24.0 -> 24.2
[notice] To update, run: pip install --upgrade pip
(paml_hosei) :~/bio/for_paml$ 次に実行スクリプトのhosei.pyを作成。
### hosei.pyの中身
import pandas as pd
from statsmodels.stats.multitest import multipletests
# 入力ファイルと出力ファイルのパス
input_file = '/Users/kosukesano/bio/for_paml/branch_site_lrt_results.txt'
output_file = '/Users/kosukesano/bio/for_paml/hosei_branch_site_lrt_results.txt'
# ファイルを読み込む
df = pd.read_csv(input_file, sep='\t')
# p値をリストに変換
p_values = df['p_val'].tolist()
# FDR制御を使用してp値を補正
rejected, q_values, _, _ = multipletests(p_values, alpha=0.05, method='fdr_bh')
# 補正後のq値と有意差をデータフレームに追加
df['q_val'] = q_values
df['significant'] = rejected
# 結果を新しいファイルに出力
df.to_csv(output_file, sep='\t', index=False)
print(f"補正後の結果が{output_file}に保存されました。")実行
(paml_hosei) :~/bio/for_paml$ python hosei.py
補正後の結果が/Users/kosukesano/bio/for_paml/hosei_branch_site_lrt_results.txtに保存されました。
(paml_hosei) :~/bio/for_paml$ 結果
3つの遺伝子で有意差が認められた。 - g4236.t1 acyl-CoA dehydrogenase family member 9, mitochondrial - g9945.t1 laminin subunit alpha - g12267.t1 D-glucuronyl C5-epimerase B
2024年9月
0903
ASTRALの実行
ASTRAL.sh.o26291746の記述は以下の通り。
start at
2024年 7月 31日 水曜日 23:02:59 JST
Error occurred during initialization of VM
Could not reserve enough space for 629145600KB object heap
2024年 7月 31日 水曜日 23:03:00 JSTヒープサイズ?を設定する値が大きすぎてエラーが出ているっぽい。
ASTRAL.shを見ると確かに-Xmx629145600Kとなっていた。これを以下のように変更。
### ASTRAL.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 16
echo start at
date
java -Xmx2G -jar astral.5.7.8.jar \
-i /home/kosukesano/tools/for_ASTRAL/Astral/data/modified_trees.nwk \
-o /home/kosukesano/tools/for_ASTRAL/Astral/240903_result/out.tre \
2>/home/kosukesano/tools/for_ASTRAL/Astral/240903_result/out.log
date-Xmx2Gにした。ついでに出力ファイルも240903_resultとした。
kosukesano@at139:~/tools/for_ASTRAL/Astral$ mkdir 240903_result240903_resultを作成したのち、ASTRAL.shをqsubで投げた。
結果
kosukesano@at139:~/tools/for_ASTRAL/Astral$ ls 240903_result/
out.log out.tre2つのファイルが出力された。それぞれのファイルの中身は以下の通り。
### out.logの中身
================== ASTRAL =====================
This is ASTRAL version 5.7.8
Gene trees are treated as unrooted
1518 trees read from /home/kosukesano/tools/for_ASTRAL/Astral/data/modified_trees.nwk
index0
All output trees will be *arbitrarily* rooted at Agra
======== Running the main analysis
Number of taxa: 6 (6 species)
Taxa: [Agra, Cass, Dpon, Smad, Sory, Tcas]
Taxon occupancy: {Cass=1518, Sory=1518, Tcas=1518, Agra=1518, Smad=1518, Dpon=1518}
Number of gene trees: 1518
0 trees have missing taxa
Calculating quartet distance matrix (for completion of X)
Species tree distances calculated ...
Building set of clusters (X) from gene trees
------------------------------
gradient0: 63
Number of Clusters after addition by distance: 63
calculating extra bipartitions to be added at level 1 ...
Adding to X using resolutions of greedy consensus ...
Limit for sigma of degrees:200
polytomy size limit : 4
discarded polytomies: [3, 4]
Threshold 0.0:
Threshold 0.01:
Threshold 0.02:
Threshold 0.05:
Threshold 0.1:
Threshold 0.2:
polytomy of size 3; rounds with additions with at least 5 support: 0; clusters: 63
Threshold 0.3333333333333333:
polytomy of size 4; rounds with additions with at least 5 support: 0; clusters: 63
max k is :0
Number of Clusters after addition by greedy: 63
gradient0 in heuristiic: 63
partitions formed in 0.477 secs
Dynamic Programming starting after 0.477 secs
Using tree-based weight calculation.
Using polytree-based weight calculation.
Polytree max score: 22770
Polytree building time: 0.061 seconds.
Number of quartet trees in the gene trees: 22770
Size of largest cluster: 6
Greedy score: 11289
estimationFactor: 2.017007706617061
Sub-optimal score: 11289
Total Number of elements weighted: 136
Normalized score (portion of input quartet trees satisfied before correcting for multiple individuals): 0.4957839262187088
Optimization score: 11289
Optimal tree inferred in 0.694 secs.
(Cass,(Smad,(Dpon,(Agra,(Sory,Tcas)))));
Final quartet score is: 11289
Final normalized quartet score is: 0.4957839262187088
Extended species tree:
(Agra,((Sory,Tcas)1:0.7104118856165281,(Dpon,(Smad,Cass)0.63:0.017264705056582508)1:0.07499457563408858));
Weight calculation took 0.024812803 secs
ASTRAL finished in 1.732 secs### out.treの中身
(Agra,((Sory,Tcas)1:0.7104118856165281,(Dpon,(Smad,Cass)0.63:0.017264705056582508)1:0.07499457563408858):0.0); これをもとに書いた系統樹は以下の通り。
A. grandisが最外群になってる〜!
out.txtの途中に出てた枝長などがない系統樹をコピーして~bio/240903_ASTRAL_Optimal_tree.treとして保存し描画してみる。
tree2 = read.tree("/Users/kosukesano/bio/240903_ASTRAL_Optimal_tree.tre")
p=ggtree(tree2)+
xlim(0, 7)+
theme(text = element_text(face = "italic"))+
geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
#geom_text(aes(label=node), hjust=-.2)+
theme_tree()
p一応Tcasが最外群にはなっている?
過去の系統樹はこちら
Dmelanogasterを加えた系統樹作成、それの準備としてのOrthofinder
外群が外群として機能していないので、一度絶対に外群だろうという昆虫(今回はキイロショウジョウバエ)を加えて系統樹を描いてみる。そのためにまずOrthoFinderを実行する。
~/tools/for_orthofinderディレクトリにSmad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dirディレクトリを作成。その中に7種の昆虫のアミノ酸配列ファイルを格納した。
kosukesano@at139:~/tools/for_orthofinder$ ls Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/
Agra.fasta Cass.fasta Dmel.fasta Dpon.fasta Smad.fasta Sory.fasta Tcas.fasta
kosukesano@at139:~/tools/for_orthofinder$ また、seven_sp.shを作成した。
### seven_sp.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 5
#$ -l medium
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
singularity exec /usr/local/biotools/o/orthofinder:2.5.4--hdfd78af_0 orthofinder -f ~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir -t 5 -a 5
dateこれをqsubで投げた。
また、手動での系統樹作成アルゴリズムの1つである、ManualPhylo_1.pyを作成し実行する。
まず、~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03下にManualPhylo_dataディレクトリを作成した。
kosukesano@at139:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03$ mkdir ManualPhilo_data0904
7種の昆虫ゲノムを使ったOrthoFinderが終わり、無事出力された。
OrthoFinder出力の系統樹はこんな感じ。
tree3 = read.tree("/Users/kosukesano/bio/7sp.tre")
p=ggtree(tree3, branch.length = 'none')+
xlim(0,9)+
theme(text = element_text(face = "italic"))+
geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
geom_nodelab(hjust = -0.1, node = "internal", size = 5) +
#geom_text(aes(label=node), hjust=-.2)+
theme_tree()
p7種の昆虫ゲノムのアミノ酸配列fastaファイルを統合し、1つのファイルにする。
~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/make_philo_tree下でfasta_concatinate.shを作成した。fasta_concatinate.shの中身は以下の通り。
### fasta_concatinate.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 5
#$ -l medium
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
# Enter the directory containing the fasta files
filesout="/home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir" ## Please replace with the actual directory containing the fasta files
# Define the output directory and output file
new="output_directory"
mkdir -p $new
# Concatenate all fasta files into one file
for file in "$filesout"/*.fasta; do
cat "$file" >> "./${new}/all_seq.fa"
done
dateこれを作業ノードで実行権限を与えて実行した。
kosukesano@at139:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/make_philo_tree$ chmod +x fasta_concatinate.sh
kosukesano@at139:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/make_philo_tree$ ./fasta_concatinate.sh結果として、~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/make_philo_tree/output_directoryにall_seq.faができた。
kosukesano@at139:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/make_philo_tree/output_directory$ ls
all_seq.fa続いて、~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/make_philo_tree下でManualPhylo_2.pyを作成し実行した。
ManualPhylo_2.pyの中身は以下の通り。0718のManualPhylo_2.pyではOGがきちんと抽出できないので注意!
### ManualPhylo_2.pyの中身
##ManualPhylo_1.pyの続き
import sys
from Bio import SeqIO
path = "../ManualPhylo_data/"
fasta_in = sys.argv[1] #1番目の引数には上記のall_seq.faなどfastaファイルを指定する
query_in = sys.argv[2] #2番目の引数には上記のOG_list.txtなどオーソログファイルを指定する
for q in open(query_in, "r"): #オーソログファイルを開いて1行づつ読み込む
query = q.split() #スペース毎に切りとってリスト形式でqueryに保存する
f = open(path + query[0], 'w') #最初の列(OG名)と同じ名前のファイルを作成する
for record in SeqIO.parse(fasta_in, 'fasta'): #fastaファイルを開くSeqIOを使ってパースする(1項目づつ読み込む)
id_part = record.id #fastaのID部分を読み込む
desc_part = record.description #fastaのdescription部分を読み込む
seq = record.seq #fastaの配列部分を読み込む
for i in range(len(query)): #オーソログファイル中の各OGに含まれる配列数を数えて、その分繰り返す(python2の人はrange を x rangeにする)
if id_part == query[i] or desc_part == query[i] : #オーソログファイルの配列descriptionとfastaの配列descriptionが一致したら、、、
fasta_seq = '>' + id_part + ' ' + desc_part + '\n' + seq + '\n' #fasta形式に整え
print(fasta_seq) #標準出力にfastaを出力(進行状況把握用)
f.write(str(fasta_seq)) #各OGファイルにfastaを出力
f.close()
##できたOGファイルは、align.shやOG_list.txtと同じ場所に
##align.shのある場所までいき、作動。cwdを231016/ManualPhylo_dataにしないとtrimalが作動せず、イライラこれにより630個のSCOが抽出された。
kosukesano@at139:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/ManualPhylo_data$ ls -1 | wc -l
632
### ls -1 | wc -lでファイル数をカウント。OG_list.txtとspecies_list.txtがあるので-2する。続いて、MAFFTとtrimAlによってOGをアライメントする。
まずMAFFTとtrimAlの環境を立ち上げる。
source ~/tools/pyenv_env/ManualPhilo_profile続いて、~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/ManualPhylo_data下にalign.shを作成、実行する。
align.shの中身は以下の通り。
### align.shの中身
#!/bin/sh
#$ -S /bin/bash
#$ -cwd
#$ -v PATH
awk '{print($1)}' $1 | while read x; do #引数に前述のOG_list.txtなどのOGリストを指定する。
mafft --auto $x > $x.maffted.fa
trimal -in $x.maffted.fa -out $x.maffted.trimed.fa -htmlout $x.maffted.trimed.fa.html -automated1
done実行コマンドは以下の通り
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/ManualPhylo_data$ sh align.sh OG_list.txt続いて~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/make_philo_treeにてManualPhylo_3.pyを作成、実行した。
ManualPhylo_3.pyの中身は以下の通り。
### ManualPhylo_3.pyの中身
##align.shした後
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import sys
from Bio import SeqIO
path = "../ManualPhylo_data/"
query_in = sys.argv[1] #1番目の引数には上記のOG_list.txtなどオーソログファイルを指定する
species_in = sys.argv[2] #2番目の引数にはOG_list.txtと同じ順番で種名を記述したファイルを指定する
sp_list = []
for sp in open(species_in, "r"): #種名ファイルを開く
sp_list.extend(sp.split()) #スペース毎に切りとってリスト形式でsp_listに保存する
for q in open(query_in, "r"): #オーソログファイルを開いて1行づつ読み込む
query = q.split() #スペース毎に切りとってリスト形式でqueryに保存する
f = open(path + query[0]+".maffted.trimed.edit.fa", 'w')
for record in SeqIO.parse(path + query[0]+".maffted.trimed.fa", 'fasta'): #fastaファイルを開くSeqIOを使ってパースする(1項目づつ読み込む)
desc_part = record.description #fastaのdescription部分を読み込む
seq = record.seq #fastaの配列部分を読み込む
desc_part_new = desc_part.split()[0]
for i in range(len(query)-1): #オーソログファイル中の各OGに含まれる配列数を数えて、その分繰り返す
if desc_part_new == query[i+1] : #オーソログファイルの配列descriptionとfastaの配列descriptionが一致したら、、、
fasta_seq = '>' + sp_list[i] + '\n' + seq + '\n' #配列名を該当する種名に置き換えて、fasta形式に整えて
print(fasta_seq) #標準出力にfastaを出力(進行状況把握用)
f.write(str(fasta_seq)) #各OGファイルにfastaを出力
f.close()
##scorpionでnano run.nexを変更し、以下を実行
##iqtree –sp run.nex –nt AUTO –bb 1000
~実行コマンド
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/make_philo_tree$ python ManualPhylo_3.py ../ManualPhylo_data/OG_list.txt ../ManualPhylo_data/species_list.txt続いて~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/ManualPhylo_data下にmakealltree.shを作成、qsubでジョブとして投げた。
makealltree.shの中身は以下の通り。
### makealltree.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date
# Singularityイメージのパスを指定
SINGULARITY_IMAGE="/usr/local/biotools/i/iqtree:2.3.3--h21ec9f0_0"
# 作業ディレクトリに移動
cd ~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/ManualPhylo_data
# 出力ファイル
output_file="all_trees.nwk"
# 既存の出力ファイルを削除
if [ -f $output_file ]; then
rm $output_file
fi
# *.maffted.trimed.edit.fa ファイルを処理
for file in *.maffted.trimed.edit.fa; do
# ファイル名から拡張子を除いたベース名を取得
base_name=$(basename $file .maffted.trimed.edit.fa)
# Singularityを使用してIQ-TREEを実行して系統樹を作成
singularity exec -e $SINGULARITY_IMAGE iqtree2 -s $file -nt AUTO -bb 1000 -cptime 600 -pre ${base_name}
# 作成された系統樹ファイル (.treefile) を output_file に追加
if [ -f ${base_name}.treefile ]; then
echo -n "${base_name}: " >> $output_file
cat ${base_name}.treefile >> $output_file
echo "" >> $output_file
else
echo "Error: ${base_name}.treefile not found" >&2
fi
done
echo "All trees have been written to $output_file"
date0905
昨日のIQ-TREEはうまくいっており、最終出力のall_trees.nwkも出力されていた。
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/ManualPhylo_data$ ls all_trees.nwk
all_trees.nwkこれをASTRAL用のディレクトリにコピー。
~/tools/for_ASTRAL/Astral/dataには以前のインプットファイルがあったため、6spと7spというディレクトリを作りそこに格納。
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$ ls
all_trees.nwk modified_trees.nwk modify.py
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$ mkdir 6sp
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$ ls
6sp all_trees.nwk modified_trees.nwk modify.py
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$ mv *.nwk 6sp
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$ ls
6sp modify.py
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$ mkdir 7sp
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$ ls
6sp 7sp modify.py
kosukesano@at137:~/tools/for_ASTRAL/Astral/data$7spの方に今回のall_trees.nwkをコピーし、modify.pyでOG番号を切り取った。
modify.pyの中身は以下の通り。
### modify.pyの中身
# 元のファイルと新しいファイルのパスを設定
input_file_path = '7sp/all_trees.nwk'
output_file_path = '7sp/modified_trees.nwk'
# 元のファイルを開いて処理
with open(input_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
for line in infile:
# 行を ': ' で分割し、要素が2つ以上の場合のみ処理
parts = line.split(': ', 1)
if len(parts) > 1:
modified_line = parts[1]
# 新しいファイルに書き込み
outfile.write(modified_line)これを使ってASTRALを実行。使ったスクリプトは以下の通り。
### ASTRAL.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 16
echo start at
date
java -Xmx2G -jar astral.5.7.8.jar \
-i /home/kosukesano/tools/for_ASTRAL/Astral/data/7sp/modified_trees.nwk \
-o /home/kosukesano/tools/for_ASTRAL/Astral/240905_result/out.tre \
2>/home/kosukesano/tools/for_ASTRAL/Astral/240905_result/out.log
dateこれをqsubで投げた。
欠失1つを許したASTRALの作成
~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/Orthogroups下でSCOwiith0tyusyutu.pyを作成、実行した。
### SCOwiith0tyusyutu.pyの中身
import pandas as pd
# ファイルを読み込む
file_path = "Orthogroups.GeneCount.tsv"
df = pd.read_csv(file_path, sep="\t")
# 各列が1か0で、1つの列が0で他が全て1の行を抽出
filtered_df = df[
((df['Agra'] == 1) & (df['Cass'] == 1) & (df['Dpon'] == 1) & (df['Smad'] == 1) & (df['Sory'] == 1) & (df['Tcas'] == 1)) |
((df['Agra'] == 0) & (df['Cass'] == 1) & (df['Dpon'] == 1) & (df['Smad'] == 1) & (df['Sory'] == 1) & (df['Tcas'] == 1)) |
((df['Agra'] == 1) & (df['Cass'] == 0) & (df['Dpon'] == 1) & (df['Smad'] == 1) & (df['Sory'] == 1) & (df['Tcas'] == 1)) |
((df['Agra'] == 1) & (df['Cass'] == 1) & (df['Dpon'] == 0) & (df['Smad'] == 1) & (df['Sory'] == 1) & (df['Tcas'] == 1)) |
((df['Agra'] == 1) & (df['Cass'] == 1) & (df['Dpon'] == 1) & (df['Smad'] == 0) & (df['Sory'] == 1) & (df['Tcas'] == 1)) |
((df['Agra'] == 1) & (df['Cass'] == 1) & (df['Dpon'] == 1) & (df['Smad'] == 1) & (df['Sory'] == 0) & (df['Tcas'] == 1)) |
((df['Agra'] == 1) & (df['Cass'] == 1) & (df['Dpon'] == 1) & (df['Smad'] == 1) & (df['Sory'] == 1) & (df['Tcas'] == 0))
]
# Orthogroup 列のみを抽出
filtered_df = filtered_df[['Orthogroup']]
# 抽出したデータを新しいファイルに保存 (ヘッダー削除) 拡張子を .txt に変更
output_file_path = "Orthogroups.GeneCount.SingleCopyWithOneZeroOrtholog.txt"
filtered_df.to_csv(output_file_path, sep="\t", index=False, header=False)
print(f"抽出されたデータが {output_file_path} に保存されました。")~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25下にSCOwithOneZero_Manualphylo_dataを作成、Manualphylo_1,2とalign.shを実行した。
kosukesano@at137:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ ls ManualPhylo_*
ManualPhylo_1.py ManualPhylo_2.py ManualPhylo_3.py### ManualPhylo_1.pyの実行
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ python ManualPhylo_1.py
### ManualPhylo_2.pyの実行
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ python ManualPhylo_2.py ~/tools/for_orthofinder/make_philo_tree/output_directory/all_seq.fa OG_list.txt
### align.shの実行
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ sh align.sh OG_list.txt0907
~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data下でManualphylo_3.pyを実行した。
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ python3 ManualPhylo_3.py OG_list.txt species_list.txt同じディレクトリにmakealltree.shをコピーし、パスを書き換えて実行した。
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ cp ~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/ManualPhylo_data/makealltree.sh ../SCOwithOneZero_Manualphylo_data/
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ ls makealltree.sh
makealltree.sh
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ less makealltree.sh
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ nano makealltree.sh
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data$ qsub_beta makealltree.sh
Your job 26698378 ("makealltree.sh") has been submitted書き換えたmakealltree.shは以下の通り。
### makealltreeの中身
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date
# Singularityイメージのパスを指定
SINGULARITY_IMAGE="/usr/local/biotools/i/iqtree:2.3.3--h21ec9f0_0"
# 作業ディレクトリに移動
cd ~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data
# 出力ファイル
output_file="all_trees.nwk"
# 既存の出力ファイルを削除
if [ -f $output_file ]; then
rm $output_file
fi
# *.maffted.trimed.edit.fa ファイルを処理
for file in *.maffted.trimed.edit.fa; do
# ファイル名から拡張子を除いたベース名を取得
base_name=$(basename $file .maffted.trimed.edit.fa)
# Singularityを使用してIQ-TREEを実行して系統樹を作成
singularity exec -e $SINGULARITY_IMAGE iqtree2 -s $file -nt AUTO -bb 1000 -cptime 600 -pre ${base_name}
# 作成された系統樹ファイル (.treefile) を output_file に追加
if [ -f ${base_name}.treefile ]; then
echo -n "${base_name}: " >> $output_file
cat ${base_name}.treefile >> $output_file
echo "" >> $output_file
else
echo "Error: ${base_name}.treefile not found" >&2
fi
done
echo "All trees have been written to $output_file"
date0908
前日のジョブをまちがって上書きしてしまった……。
すべてqdelして改めてqsub
0909
IQ-TREEのジョブは無事できていた。最終出力のall_trees.nwkをASTRAL用のディレクトリにコピー。
~/tools/for_ASTRAL/Astral/data/6sp_withOneZeroというディレクトリを作りそこに格納。modify.pyで OG番号を切り取った。
これを使ってASTRALを実行。使ったスクリプトは以下の通り。
### ASTRAL.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 16
echo start at
date
java -Xmx2G -jar astral.5.7.8.jar \
-i /home/kosukesano/tools/for_ASTRAL/Astral/data/6sp_withOneZero/modified_trees.nwk \
-o /home/kosukesano/tools/for_ASTRAL/Astral/240909_result/out.tre \
2>/home/kosukesano/tools/for_ASTRAL/Astral/240909_result/out.log
date0910
240910_resultを作ってなかったのでエラー……。
mkdirで作り直してもう一度動かす。ついでに作業ノードで実行権限を付与して行ってみる。
kosukesano@at138:~/tools/for_ASTRAL/Astral$ mkdir 240910_result
kosukesano@at138:~/tools/for_ASTRAL/Astral$ nano ASTRAL.sh
kosukesano@at138:~/tools/for_ASTRAL/Astral$ chmod +x ASTRAL.sh
kosukesano@at138:~/tools/for_ASTRAL/Astral$ ./ASTRAL.sh
start at
Tue Sep 10 10:24:33 JST 2024
Tue Sep 10 10:24:34 JST 2024
kosukesano@at138:~/tools/for_ASTRAL/Astral1秒でできた。
出力ファイルもしっかりある。今後はジョブとして投げなくて良さそう。
kosukesano@at138:~/tools/for_ASTRAL/Astral/240910_result$ ls
out.log out.tre
kosukesano@at138:~/tools/for_ASTRAL/Astral/240910_result$出力ファイルout.txtをよく見てみる。
======== Running the main analysis
Number of taxa: 6 (6 species)
Taxa: [Agra, Cass, Dpon, Smad, Sory, Tcas]
Taxon occupancy: {Cass=1950, Sory=1950, Tcas=1518, Agra=1950, Smad=1950, Dpon=1950}
Number of gene trees: 1950
432 trees have missing taxa
Calculating quartet distance matrix (for completion of X)
Species tree distances calculated ...
Will attempt to complete bipartitions from X before adding using a distance matrix.
Building set of clusters (X) from gene trees Tcasしか欠失分が取れていない?
系統樹の描画
最終出力であるout.treを240910_ASTRAL.treとしてコピー、それを元に系統樹を描画。
tree4 = read.tree("/Users/kosukesano/bio/240910_ASTRAL.tre")
p=ggtree(tree4)+
xlim(0, 7)+
theme(text = element_text(face = "italic"))+
geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
#geom_text(aes(label=node), hjust=-.2)+
theme_tree()
p6種のSCOについて、欠失を1つまで許した時の系統樹
前の系統樹と比べると、マダラとキクイムシの位置がずれている。
out.txtの中にある系統樹もコピーし、240910_ASTRAL_Optimal_tree.treとして描画。
tree5= read.tree("/Users/kosukesano/bio/240910_ASTRAL_Optimal_tree.tre")
p=ggtree(tree5)+
xlim(0, 7)+
theme(text = element_text(face = "italic"))+
geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
#geom_text(aes(label=node), hjust=-.2)+
theme_tree()
pSCOの欠失部分がうまく取れていない件について
Tcasで欠失しているOG0010273を例にとって検証
Orthogroup Agra Cass Dpon Smad Sory Tcas Total
OG0010273 1 1 1 1 1 0 5
OG0010276 1 1 1 1 1 0 5
OG0010278 1 0 1 1 1 1 5### OG0010273.maffted.fnaの中身
>XP_050293049.1 XP_050293049.1 ras-related protein Rab-27A [Anthonomus grandis grandis]
--MEYDYLIKFLALGDSGVGKTSFLYQYTDSSFNSRFISTVGIDFREKRLIYQAKGRSYR
VHLQLWDTAGQERFRSLTTAFYRDAMGFILIFDLTNEQSFLEIRNWINQLRIHAYCDTPD
IVLCGNKADLEDRRVVSEWKAREFAELNGLPYLETSAATGQNVSRSIETLLERVMIRMET
AVDSAMLPTHRDNFRNPLRVGLNTNYSAQKCSC
>XP_019755005.1 XP_019755005.1 ras-related protein Rab-27A [Dendroctonus ponderosae]
MRMDYDYLIKFLALGDSGVGKTSFLYQYTDGTFNSRFISTVGIDFREKRLIYQSKGRNYR
VHLQLWDTAGQERFRSLTTAFYRDAMGFLLLFDLTNERSFLEIRNWIEQLRVHAYCDTPD
IVLCGNKADIEDRRVVSEWKAREFAEINGLPYLETSAATGQNISRAIETLLEKVMYRMET
AVDMAMLPNRRGNPGDHSQIDLSAPSSAQKCLC
>g2477.t1 g2477.t1
--MEYDYLIKFLALGDSGVGKTSFLHQYTDGTFNSRFISTVGIDFREKRLVYQSKGRNYR
VHLQLWDTAGQERFRSLTTAFYRDAMGFLLLFDLTNEQSFLEIRNWIEQLRVHAYCDTPD
VILCGNKADLEDRRVITEWKAREFAESNGLPYLETSAATGQNVSRAIETLLEKVMYRMET
AVDMAMLPNRRGNLKEVLKVDLNASPSAQKCLC
>XP_030762023.1 XP_030762023.1 ras-related protein Rab-27A [Sitophilus oryzae]
--MDYDYLIKFLALGDSGVGKTSFLYQYTDGTFNSRFISTVGIDFREKRMIYQSKGRNYR
VHLQLWDTAGQERFRSLTTAFYRDAMGFLLLFDLTNEHSFLEIRNWIEQLRLHAYCDTPD
IVLCGNKADLEDRRVVTEWRAREFAEINGLPYLETSAATGQNVSRAVETLLEKVMLRMET
AVDMAMVPGQSGKFKDTGEFMLRSSSPAQKCTC
>XP_967715.1 XP_967715.1 PREDICTED: ras-related protein Rab-27A [Tribolium castaneum]
--MDYDYLIKFLALGDSGVGKTSFLYQYTDGLFNSRFISTVGIDFREKRLLYQSKGRNHR
VHLQLWDTAGQERFRSLTTAFYRDAMGFLLLFDLTNEQSFLEIRNWVEQLRLHAYCDCPD
VVLCGNKADLEDRRIITEWRAREMAEKLGLVYLETSAATGQNVSRAVETLLEKVMIRMET
AVDRAMLPGRRGRPRDPNDVDFNAP-PTHNCTCCass以外の5種の配列が記述されている。
### OG0010278.maffted.trimed.edit.faの中身
>Agra
MEYDYLIKFLALGDSGVGKTSFLYQYTDSSFNSRFISTVGIDFREKRLIYQAKGRSYRVHLQLWDTAGQERFRSLTTAFYRDAMGFILIFDLTNEQSFLEIRNWINQLRIHAYCDTPDIVLCGNKADLEDRRVVSEWKAREFAELNGLPYLETSAATGQNVSRSIETLLERVMIRMETAVDSAMLPTHRDNFRNPLRVGLNTNSAQKCSC
>Cass
MDYDYLIKFLALGDSGVGKTSFLYQYTDGTFNSRFISTVGIDFREKRLIYQSKGRNYRVHLQLWDTAGQERFRSLTTAFYRDAMGFLLLFDLTNERSFLEIRNWIEQLRVHAYCDTPDIVLCGNKADIEDRRVVSEWKAREFAEINGLPYLETSAATGQNISRAIETLLEKVMYRMETAVDMAMLPNRRGNPGDHSQIDLSAPSAQKCLC
>Dpon
MEYDYLIKFLALGDSGVGKTSFLHQYTDGTFNSRFISTVGIDFREKRLVYQSKGRNYRVHLQLWDTAGQERFRSLTTAFYRDAMGFLLLFDLTNEQSFLEIRNWIEQLRVHAYCDTPDVILCGNKADLEDRRVITEWKAREFAESNGLPYLETSAATGQNVSRAIETLLEKVMYRMETAVDMAMLPNRRGNLKEVLKVDLNASSAQKCLC
>Smad
MDYDYLIKFLALGDSGVGKTSFLYQYTDGTFNSRFISTVGIDFREKRMIYQSKGRNYRVHLQLWDTAGQERFRSLTTAFYRDAMGFLLLFDLTNEHSFLEIRNWIEQLRLHAYCDTPDIVLCGNKADLEDRRVVTEWRAREFAEINGLPYLETSAATGQNVSRAVETLLEKVMLRMETAVDMAMVPGQSGKFKDTGEFMLRSSPAQKCTC
>Sory
MDYDYLIKFLALGDSGVGKTSFLYQYTDGLFNSRFISTVGIDFREKRLLYQSKGRNHRVHLQLWDTAGQERFRSLTTAFYRDAMGFLLLFDLTNEQSFLEIRNWVEQLRLHAYCDCPDVVLCGNKADLEDRRIITEWRAREMAEKLGLVYLETSAATGQNVSRAVETLLEKVMIRMETAVDRAMLPGRRGRPRDPNDVDFNAPPTHNCTCCassが入り、その分Tcasが消えてる。種名ラベルの記述がミスっている?
0911
~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data/下にtestディレクトリを作り、色々検証。
OG_list.txtとspecies_list.txt、Manualphylo_*ファイルをコピーした。
Manualphylo_2.pyの実行
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data/test$ python ManualPhylo_2.py ~/tools/for_orthofinder/make_philo_tree/output_directory/all_seq.fa OG_list.txtManualphylo_2.pyを実行すると、OG0008034のようなOGごとの配列が記録されたファイルができる。
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data/test$ ls
ManualPhylo_1.py ManualPhylo_3.py OG0008034 OG0008036 OG0008039 OG0008041 OG0008043 OG0008045 OG0008048 OG_list.txt species_list.txt
ManualPhylo_2.py OG0008033 OG0008035 OG0008037 OG0008040 OG0008042 OG0008044 OG0008046 OG0008049 align.sh
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data/test$ 上記のは途中でCtr+Cを押して終了させた。
OG0010273のみをコピーし、残りは消去。
align_shの実行
kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data/test$ source ~/tools/pyenv_env/ManualPhilo_profile
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data/test$ sh align.sh OG_list.txtalign_shを実行すると、*maffted.fa、*maffted.trimed.faができる。*maffted.faはヘッダーがフルで入っているが、*maffted.trimed.faはヘッダー部分は遺伝子IDのみになっている。
### OG0010273.maffted.faの中身
>XP_050292831.1 XP_050292831.1 uncharacterized protein LOC126733540 [Anthonomus grandis grandis]
MPFNVMIQLCNITRQSKLFQRNFYLPLLKPLSPKDALRIHVITGKDVLSKALEHWIPVLE
EYSARVARKRHMPKGKGKKRLRKKESMKYLMPFFDSGMDNHAKLENVCNRKSKGRGQCYT
YYIPEKKCTLVFTHQMERAIRDKDIIDVIIAQRHEKILVKMRDGTKVVMPLSPYNGKGPL
YRGEGWTRKDIEEYHHHGRETFSIAQLFEAKEAGIEEWELEMMRMANKRKKKMKGEDTRD
WKEMLAATASNMDWDAFEEETKQIVEQANEPVNDEPIAMQLDNIDKTCDVIDKLKAAGEE
VLSLLPTMPEIPEILKVIKDQSELTEIANISGARVNLKASAERPGERFVPGQMVTSEEGD
LFVPGQTILNESGVKEYTPGFTVLLDNEPTLIPGLVMGNDPDKSMFLPGESTITESGELQ
FAATDEDILPHVDTPPP-EKEVEEIELEEEQNSEDEEIEQRPPPKPKRKELTYERPKREF
KTENMGPKRRVRGPKKVAEPAPAPAPPTLE--RRPTIIIEAKLFNLQTPTFEKDILEQQK
ERVEAFKEKTGKEEARLNKYRLELRMKAKKMRESLPPPPIYEPLEPVRKSEKLRELEKSI
KKGRFFEADHKKYITNEYTE-KFHWIDTYQYKKVFDTVGIMRHRVWKPVYS--H
>CAG9762300.1 CAG9762300.1 unnamed protein product [Ceutorhynchus assimilis]
MPFNVMIQLCNITRQS----------------------------KDVLAKALEHWIPVLE
DHASRVARRRHMPKGKGKKRQRKKESMRYLTPFFDEMMDNHGRLENMIIRKPKGRGQCYT
YYIPEKKCTLVLTHQFEKAIRDKDIVDIVIAQRHEKIIVKLRDGLKVVMPLSPYSGKAPF
YRGEGWTKKDIEEFHHHGRETFSIAQLFEAKEAGIEEWELEMMRMANKRKKKMKGEDTRD
WKEMLAATTENMDWDAFEEESRQIVEEANEPVEDDPIAMQLNDIEKTCDVTEKLKAAGED
VMSLLPKMPEIPEILKIFKDHSELTEIANVSGARVNLATGT----HRFVPGQMVTSEDGE
IFVPGQTVLTESGEREYTPGFTVLLEDEPTLIPGLVMGNDPEKTMFLPGEATITESGELQ
FGVNEDDIVP--SLTPPFEKEVEEIELEEEQNSEEEEIEQRPPPK--KKELTYERPKREF
NQEKMGPKHRVRGPKKLPPVVSPPEKDLAD--RRKTIVPDTKLFDLTTQTFEKDFLEQEK
ERVEAFKEKTGKEEAKVDKQRREIKLMVKKMRDSLPPKPKYQPLEPVRKSEKLRDMEKSI
KKGKFFEVDYKKWLTKENNHEPFHWMDTYQYKKTFDSVGIIRHRIWKSVY----
>XP_019758826.1 XP_019758826.1 uncharacterized protein LOC109536854 [Dendroctonus ponderosae]
MPFNVMIQLCNITRQSKLFQRNFFLPLLKPLSPKDALRIHIITGKDVLVKAIEHWIPVLE
EYACRVQRRRHMPKGKGKKRQRKKESMKYVMPFFDDMMENHPKLENLIIRKPKGRGQCYT
YYIPEKKCTLVLTYQMEKAIRDKDIVDIVIAQRHEKILVKMRDGTKVVMPLCPYNGHAPF
YRGEGWTRKDIEEYHHHGRETFSIAQLFEAKEAGIEEWELEMMRMANKRKKKMKGEDSRD
WKDMLNSTVNNMDWDAFEEESKQIVAESNDPVDDEPISMQLDDIEKTCDVNEKLKAAGDE
IMSLLPTMPEIPQILKVMKDQSEFTEIANISGARVSLSAGS----DRFVPGQMVTSEDGE
LFVPGQTVVNESGDKEYTPGFTVLLDNEPTLIPGLVMGNDPDKSMFLPGESTITEAGELQ
FAATEDDIIPHPPTPPPEEKEVEEVELEEDQNSEEEEVEQRPPPKRERKELTYERPKREF
NTENMGPKHRVRGPKKVAPVV-IKTEETPDPVRRKT-IIDAKIFDLQTPTFEKDFLEQEK
ERVEAFKEKSGKEEAKVDKQRREIKLKVKKLVDSRPPPPKYEPLEPVRKSEKLREFEKSI
KRGKFFDVDYKKYLTKEYTG-QFHWLDTYQYRNTFDTVGIMRHRIWKSVY----
>g9568.t1 g9568.t1
MPFNVMIQLCSITRQSKLFQRNFYLPLLKPLSPKDALRIHIITGKDVLAKALEHWIPVLE
EYASRVARRRHMPKGRGKKRQRKKDSMKYVCPFFDDNMDNHGRLENMVNRKSKGRGQCYT
YYIPEKKCTLVVTHQMERAIRDKDIVDIVIAQRQEKILVKMKDGTKVVMPLCPYNGRNPL
YRGEGWTKKEIEEYHHHGRDTFSIAQLFEAKEAGIEEWELEMMRMANKRKKKIKGEDTKD
WKEMLSTTMQNMDWDAFEEESKQIVEEANEPVDDEPIPMQLNDMDKTCDVIEKLKAAGDD
VLKLLPVMPEIPEILKVIQDQSEFTEIASISGARVSLTSGS----ERFVPGQMVTSEEGE
LFVPGQTTVSESGEKEYTPGFTVLLDNEPTLIPGLVMGNDPEKSMFLPGESTITESGELQ
FAATEDDILPYQPAPPSEEKEVEEVELEEEQNSEEEEIEQRPPPKREKKEFTYERPKREF
NPESMGPKHRVRGPKKVPPMVQAPAEPTPDPARRKT-VVEVKIFDLQTPTFEKDFLEQEK
ERVEAFKEKSGKEEAKVDKQRREIKMMAKKIIDSSPRVVKYEPLEPVKKSEKLREFEKSI
KKGNFFDVDYKKWLSRNHKE-QFHWADTYQYRNTFDTVGIMRHRVWKSVYSSRK
>XP_030763397.1 XP_030763397.1 uncharacterized protein LOC115887965 [Sitophilus oryzae]
MPFNVMIQLCRVTRASKLFQRNFYLPLLKPLSPKDALRRHIITGKDVLQKALEHWIPVLE
EYAARVQRRRHAPKGRGKKRQRRKESMKYVMPFFDDTLPSHPKLENLVARKSKGRGQCYT
YHIPEKKCTLVLTHQMERAIRDKDIVDIVIAQRHEKIIVKMRDGTKVTMPLCPYEGRAPL
YRGEGWTRKDIEEFHHHGHETFSIAQLFEAKEAGIEEWELEMMRLASQRKKKMKGEGTQD
WKAMLQTTVENMDWEEFEEDAKQIVTEVNEVVEDEPIAMQVDDMELTCDVNEKLKAAGAD
VLALLPSMPEIPQLLRLLSGQSELTQVAKVSGARVSLDAGS----DRFVPGQLVASEEGE
LFVPGQTVLTEAGEKEYTPGFTVMMDGEPTLIPGLVMGNDPNKAMFLPGESTITGGGELQ
FAASADDVLVNEPLPPP-VEEPEEAELDEDQNSVEEEIEMRPPPKRERKEFVYERPKRQY
DVESMGPKHRERGPKRLPAALQAAANEPPP--APKP-FVPVKMIEFTPPVFEKDLLEQEK
ERVAAMKEKTGKEEAKVDKTRREIRMRAKNLMDSRPPPPKYEPLEPVRKSEKLREMERSI
KQGAFFDTDYKKYLVRERNSWPVNWLEKYQYRNTFDTVGIQRHRVWKSVF----### OG0010273.maffted.trimed.faの中身
>XP_050292831.1
MPFNVMIQLCNITRQSKLFQRNFYLPLLKPLSPKDALRIHVITGKDVLSKALEHWIPVLE
EYSARVARKRHMPKGKGKKRLRKKESMKYLMPFFDSGMDNHAKLENVCNRKSKGRGQCYT
YYIPEKKCTLVFTHQMERAIRDKDIIDVIIAQRHEKILVKMRDGTKVVMPLSPYNGKGPL
YRGEGWTRKDIEEYHHHGRETFSIAQLFEAKEAGIEEWELEMMRMANKRKKKMKGEDTRD
WKEMLAATASNMDWDAFEEETKQIVEQANEPVNDEPIAMQLDNIDKTCDVIDKLKAAGEE
VLSLLPTMPEIPEILKVIKDQSELTEIANISGARVNLKASAERFVPGQMVTSEEGDLFVP
GQTILNESGVKEYTPGFTVLLDNEPTLIPGLVMGNDPDKSMFLPGESTITESGELQFAAT
DEDILPHVDTPPPEKEVEEIELEEEQNSEDEEIEQRPPPKPKRKELTYERPKREFKTENM
GPKRRVRGPKKVAEPAPAPAPPTLERRPTIIEAKLFNLQTPTFEKDILEQQKERVEAFKE
KTGKEEARLNKYRLELRMKAKKMRESLPPPPIYEPLEPVRKSEKLRELEKSIKKGRFFEA
DHKKYITNEYTEKFHWIDTYQYKKVFDTVGIMRHRVWKPVY
>CAG9762300.1
MPFNVMIQLCNITRQS----------------------------KDVLAKALEHWIPVLE
DHASRVARRRHMPKGKGKKRQRKKESMRYLTPFFDEMMDNHGRLENMIIRKPKGRGQCYT
YYIPEKKCTLVLTHQFEKAIRDKDIVDIVIAQRHEKIIVKLRDGLKVVMPLSPYSGKAPF
YRGEGWTKKDIEEFHHHGRETFSIAQLFEAKEAGIEEWELEMMRMANKRKKKMKGEDTRD
WKEMLAATTENMDWDAFEEESRQIVEEANEPVEDDPIAMQLNDIEKTCDVTEKLKAAGED
VMSLLPKMPEIPEILKIFKDHSELTEIANVSGARVNLATGTHRFVPGQMVTSEDGEIFVP
GQTVLTESGEREYTPGFTVLLEDEPTLIPGLVMGNDPEKTMFLPGEATITESGELQFGVN
EDDIVP--SLTPPEKEVEEIELEEEQNSEEEEIEQRPPPK--KKELTYERPKREFNQEKM
GPKHRVRGPKKLPPVVSPPEKDLADRRKTVPDTKLFDLTTQTFEKDFLEQEKERVEAFKE
KTGKEEAKVDKQRREIKLMVKKMRDSLPPKPKYQPLEPVRKSEKLRDMEKSIKKGKFFEV
DYKKWLTKENNHPFHWMDTYQYKKTFDSVGIIRHRIWKSVY
>XP_019758826.1
MPFNVMIQLCNITRQSKLFQRNFFLPLLKPLSPKDALRIHIITGKDVLVKAIEHWIPVLE
EYACRVQRRRHMPKGKGKKRQRKKESMKYVMPFFDDMMENHPKLENLIIRKPKGRGQCYT
YYIPEKKCTLVLTYQMEKAIRDKDIVDIVIAQRHEKILVKMRDGTKVVMPLCPYNGHAPF
YRGEGWTRKDIEEYHHHGRETFSIAQLFEAKEAGIEEWELEMMRMANKRKKKMKGEDSRD
WKDMLNSTVNNMDWDAFEEESKQIVAESNDPVDDEPISMQLDDIEKTCDVNEKLKAAGDE
IMSLLPTMPEIPQILKVMKDQSEFTEIANISGARVSLSAGSDRFVPGQMVTSEDGELFVP
GQTVVNESGDKEYTPGFTVLLDNEPTLIPGLVMGNDPDKSMFLPGESTITEAGELQFAAT
EDDIIPHPPTPPPEKEVEEVELEEDQNSEEEEVEQRPPPKRERKELTYERPKREFNTENM
GPKHRVRGPKKVAPVV-IKTEETPDRRKTIIDAKIFDLQTPTFEKDFLEQEKERVEAFKE
KSGKEEAKVDKQRREIKLKVKKLVDSRPPPPKYEPLEPVRKSEKLREFEKSIKRGKFFDV
DYKKYLTKEYTGQFHWLDTYQYRNTFDTVGIMRHRIWKSVY
>g9568.t1
MPFNVMIQLCSITRQSKLFQRNFYLPLLKPLSPKDALRIHIITGKDVLAKALEHWIPVLE
EYASRVARRRHMPKGRGKKRQRKKDSMKYVCPFFDDNMDNHGRLENMVNRKSKGRGQCYT
YYIPEKKCTLVVTHQMERAIRDKDIVDIVIAQRQEKILVKMKDGTKVVMPLCPYNGRNPL
YRGEGWTKKEIEEYHHHGRDTFSIAQLFEAKEAGIEEWELEMMRMANKRKKKIKGEDTKD
WKEMLSTTMQNMDWDAFEEESKQIVEEANEPVDDEPIPMQLNDMDKTCDVIEKLKAAGDD
VLKLLPVMPEIPEILKVIQDQSEFTEIASISGARVSLTSGSERFVPGQMVTSEEGELFVP
GQTTVSESGEKEYTPGFTVLLDNEPTLIPGLVMGNDPEKSMFLPGESTITESGELQFAAT
EDDILPYQPAPPSEKEVEEVELEEEQNSEEEEIEQRPPPKREKKEFTYERPKREFNPESM
GPKHRVRGPKKVPPMVQAPAEPTPDRRKTVVEVKIFDLQTPTFEKDFLEQEKERVEAFKE
KSGKEEAKVDKQRREIKMMAKKIIDSSPRVVKYEPLEPVKKSEKLREFEKSIKKGNFFDV
DYKKWLSRNHKEQFHWADTYQYRNTFDTVGIMRHRVWKSVY
>XP_030763397.1
MPFNVMIQLCRVTRASKLFQRNFYLPLLKPLSPKDALRRHIITGKDVLQKALEHWIPVLE
EYAARVQRRRHAPKGRGKKRQRRKESMKYVMPFFDDTLPSHPKLENLVARKSKGRGQCYT
YHIPEKKCTLVLTHQMERAIRDKDIVDIVIAQRHEKIIVKMRDGTKVTMPLCPYEGRAPL
YRGEGWTRKDIEEFHHHGHETFSIAQLFEAKEAGIEEWELEMMRLASQRKKKMKGEGTQD
WKAMLQTTVENMDWEEFEEDAKQIVTEVNEVVEDEPIAMQVDDMELTCDVNEKLKAAGAD
VLALLPSMPEIPQLLRLLSGQSELTQVAKVSGARVSLDAGSDRFVPGQLVASEEGELFVP
GQTVLTEAGEKEYTPGFTVMMDGEPTLIPGLVMGNDPNKAMFLPGESTITGGGELQFAAS
ADDVLVNEPLPPPVEEPEEAELDEDQNSVEEEIEMRPPPKRERKEFVYERPKRQYDVESM
GPKHRERGPKRLPAALQAAANEPPPAPKPFVPVKMIEFTPPVFEKDLLEQEKERVAAMKE
KTGKEEAKVDKTRREIRMRAKNLMDSRPPPPKYEPLEPVRKSEKLREMERSIKQGAFFDT
DYKKYLVRERNSPVNWLEKYQYRNTFDTVGIQRHRVWKSVFManualphylo_3.pyの実行
(MPT) kosukesano@at138:~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data/test$ python3 ManualPhylo_3.py OG_list2.txt species_list.txtManualphylo_3.pyを実行すると、*.maffted.trimed.edit.faができる。
対策
*.maffted.trimed.faのヘッダー行を遺伝子IDにせず、元のヘッダーを保持させるalign.shのコード内で-keepheaderオプションを追加し、ヘッダー行を保持させる。これによりヘッダー行に種名の情報を残す。
Manualphylo_3.pyをspecies_list.txtに頼らないよう書き替えManualphylo_3.pyで行っているヘッダー行の書き換えについて、種名を元のヘッダーから取得するように変更。
変更後のalign.shはこちら
#!/bin/sh
#$ -S /bin/bash
#$ -cwd
#$ -v PATH
awk '{print($1)}' $1 | while read x; do #引数に前述のOG_list.txtなどのOGリストを指定する。
mafft --auto $x > $x.maffted.fa
trimal -in $x.maffted.fa -out $x.maffted.trimed.fa -keepheader -htmlout $x.maffted.trimed.fa.html -automated1
done新規スクリプトNew_Manualphylo_3.shはこちら
import os
from Bio import SeqIO
# 処理するディレクトリのパス
input_dir = '~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data/'
input_dir = os.path.expanduser(input_dir)
# ディレクトリ内のファイルをリスト
files = [f for f in os.listdir(input_dir) if f.endswith('.maffted.trimed.fa')]
# 各ファイルに対して処理を適用
for file in files:
input_file = os.path.join(input_dir, file)
# 出力ファイルのパス
og_number = file.split('.')[0] # OG番号を取得
output_file = os.path.join(input_dir, f"{og_number}.maffted.trimed.edit.fa")
# ファイルの読み込みと書き換え
with open(output_file, 'w') as outfile:
for record in SeqIO.parse(input_file, 'fasta'):
header = record.description
seq = str(record.seq)
# ヘッダーが「>g」で始まる場合、「>Smad」に置き換え
if header.startswith("g"):
new_header = ">Smad"
# ヘッダーが「]」で終わる場合、指定の形式に変換
elif header.endswith("]"):
# 「[]」内の最初の1文字とスペース後の3文字を抽出
within_brackets = header.split('[')[1].split(']')[0]
first_letter = within_brackets[0]
space_after = within_brackets.split()[-1][:3]
new_header = f">{first_letter}{space_after}"
else:
new_header = header
# 新しいヘッダーと配列を出力ファイルに書き込む
outfile.write(f"{new_header}\n{seq}\n")
# ジョブの進行状況を出力
print(f"Processed: {file}")
print("全てのファイルが処理されました。")これを行って*maffted.trimed.edit.faを新しく作り直したのち、makealltree.shを改めてジョブとして投げた。
Checkpoint (OG0008033.ckp.gz) indicates that a previous run successfully finished
Use -redo option if you really want to redo the analysis and overwrite all output files.
Use --redo-tree option if you want to restore ModelFinder and only redo tree search.
Use --undo option if you want to continue previous run when changing/adding options.ジョブを投げたら「それもうすでにやってるけど上書きしていい?」って警告が出た。上書きしていいように-redoのオプションをつけて再度実行。
0912
ショウジョウバエを含めたASTRAL出力の系統樹の描画
そういえばOrthofinderのみでこっちはしてなかったのでここに記載
tree6 = read.tree("/Users/kosukesano/bio/240912_ASTRAL.tre")
p=ggtree(tree6)+
xlim(0, 7)+
theme(text = element_text(face = "italic"))+
geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
#geom_text(aes(label=node), hjust=-.2)+
theme_tree()
ptree7= read.tree("/Users/kosukesano/bio/240912_ASTRAL_Optimal_tree.tre")
p=ggtree(tree7)+
xlim(0, 7)+
theme(text = element_text(face = "italic"))+
geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
#geom_text(aes(label=node), hjust=-.2)+
theme_tree()
pショウジョウバエがあり得ない位置にいる!?
OG番号から配列を取ってくる際に種名をつけるところをミスっているのでは?昨日のNew_Manualphylo_3.pyをこれにも適用してもう一度やってみる。
### align.shの中身
#!/bin/sh
#$ -S /bin/bash
#$ -cwd
#$ -v PATH
awk '{print($1)}' $1 | while read x; do #引数に前述のOG_list.txtなどのOGリストを指定する。
mafft --auto $x > $x.maffted.fa
trimal -in $x.maffted.fa -out $x.maffted.trimed.fa -keepheader -htmlout $x.maffted.trimed.fa.html -automated1
done###New_Maunalphylo_3.pyの中身
import os
from Bio import SeqIO
# 処理するディレクトリのパス
input_dir = '~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/ManualPhylo_data'
input_dir = os.path.expanduser(input_dir)
# ディレクトリ内のファイルをリスト
files = [f for f in os.listdir(input_dir) if f.endswith('.maffted.trimed.fa')]
# 各ファイルに対して処理を適用
for file in files:
input_file = os.path.join(input_dir, file)
# 出力ファイルのパス
og_number = file.split('.')[0] # OG番号を取得
output_file = os.path.join(input_dir, f"{og_number}.maffted.trimed.edit.fa")
# ファイルの読み込みと書き換え
with open(output_file, 'w') as outfile:
for record in SeqIO.parse(input_file, 'fasta'):
header = record.description
seq = str(record.seq)
# ヘッダーが「>g」で始まる場合、「>Smad」に置き換え
if header.startswith("g"):
new_header = ">Smad"
# ヘッダーが「]」で終わる場合、指定の形式に変換
elif header.endswith("]"):
# 「[]」内の最初の1文字とスペース後の3文字を抽出
within_brackets = header.split('[')[1].split(']')[0]
first_letter = within_brackets[0]
space_after = within_brackets.split()[-1][:3]
new_header = f">{first_letter}{space_after}"
else:
new_header = header
# 新しいヘッダーと配列を出力ファイルに書き込む
outfile.write(f"{new_header}\n{seq}\n")
# ジョブの進行状況を出力
print(f"Processed: {file}")
print("全てのファイルが処理されました。")これをどれも実行した上で、改めてmakealltree.shをqsubで投げた。
0917
New_Manualphylo_3.pyを使用した昆虫6種のASTRAL
~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/SCOwithOneZero_Manualphylo_data下でNew_Manualphylo_3.pyを実行し、その上でIQ-TREEにかけた結果が出てきた。
この最終出力であるall_trees.nwkを~/tools/for_ASTRAL/Astral/data/new_6sp_withOneZero/にコピー。
注意!
元のヘッダーで種名以外の部分に[]が使われており、uubiという種が新たに創造されていた。2つ3つだったのでnanoで手動編集し修正した。
このファイルをmodify.pyにかけ、OG番号を削除。ASTRALにかける。
出力ファイルは~/tools/for_ASTRAL/Astral/240917_6sp_withOneZero_resultに格納。その中身は以下の通り。
### ~/tools/for_ASTRAL/Astral/240917_6sp_withOneZero_result/out.logの中身の一部
======== Running the main analysis
Number of taxa: 6 (6 species)
Taxa: [Agra, Cass, Dpon, Smad, Sory, Tcas]
Taxon occupancy: {Cass=1837, Sory=1902, Tcas=1856, Agra=1921, Smad=1843, Dpon=1909}
Number of gene trees: 1950
432 trees have missing taxa
Calculating quartet distance matrix (for completion of X)
Species tree distances calculated ...
Will attempt to complete bipartitions from X before adding using a distance matrix.
Building set of clusters (X) from gene trees 各種に欠損がちびちび入ってるから無事取れてそう?
これにより描かれた系統樹は以下の通り。
tree8= read.tree("/Users/kosukesano/bio/240917_6sp_withOneZero_ASTRAL.tre")
p=ggtree(tree8)+
xlim(0, 7)+
theme(text = element_text(face = "italic"))+
geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
#geom_text(aes(label=node), hjust=-.2)+
theme_tree()
p=flip(p, 2, 3)
p=flip(p, 1, 8)
pAgraが変なとこにいるのは変わらない.
New_Manualphylo_3.pyを使用した昆虫7種のASTRAL
ショウジョウバエも入れた時の結果。
all_trees.nwkを~/tools/for_ASTRAL/Astral/data/new_7sp/にコピー。
注意!
これも同様に元のヘッダーで種名以外の部分に[]が使われており、uubiという種が新たに創造されていた。また__という種もあった。2つ3つだったのでnanoで手動編集し修正した。
このファイルをmodify.pyにかけ、OG番号を削除。ASTRALにかける。出力ファイルは~/tools/for_ASTRAL/Astral/240917_7sp_resultに格納。その中身は以下の通り。
### ~/tools/for_ASTRAL/Astral/240917_7sp_result/out.logの中身の一部
======== Running the main analysis
Number of taxa: 7 (7 species)
Taxa: [Agra, Cass, Dpon, Dmel, Tcas, Sory, Smad]
Taxon occupancy: {Dmel=630, Cass=630, Sory=630, Tcas=630, Agra=630, Smad=630, Dpon=630}
Number of gene trees: 630
0 trees have missing taxa
Calculating quartet distance matrix (for completion of X)
Species tree distances calculated ...
Building set of clusters (X) from gene trees
------------------------------これにより描かれた系統樹は以下の通り。
tree9= read.tree("/Users/kosukesano/bio/240917_7sp_ASTRAL.tre")
p=ggtree(tree9)+
xlim(0, 7)+
theme(text = element_text(face = "italic"))+
geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
#geom_text(aes(label=node), hjust=-.2)+
theme_tree()
p=flip(p, 2, 11)
p=flip(p, 1, 9)
pSmad・Cass・Dponのクレード関係は先行研究と違ってるね。Agraはそもそも元データがダメか?
6種の昆虫のミトコンドリアCO1遺伝子での系統樹推定
~/tools/for_orthofinder/CO1_6sp/dataディレクトリを作成。
その下でSmad.fastaを作成した。参照元はこちら
Cass.fasta参照元はこちら
Dpon.fasta参照元はこちら
Agra.fasta参照元はこちら
Sory.fasta参照元はこちら
Tcas.fasta参照元はこちら
これを1つにまとめたファイル、COI.fastaを~/tools/for_orthofinder/CO1_6spに作成した。
これについて、align.shを実行した。align.shのスクリプトは以下の通り。
### align.shの中身
#!/bin/sh
#$ -S /bin/bash
#$ -cwd
#$ -v PATH
# COI.fasta に対してMAFFTとTrimAlを実行
mafft --auto COI.fasta > COI.maffted.fa
trimal -in COI.maffted.fa -out COI.maffted.trimed.fa -htmlout COI.maffted.trimed.fa.html -automated1さらに、これについてIQ-TREEで遺伝子系統樹を描いた。実行スクリプトIQ_TREE.shは以下の通り。
### IQ_TREE.shの中身
#!/bin/bash
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date
# Singularityイメージのパスを指定
SINGULARITY_IMAGE="/usr/local/biotools/i/iqtree:2.3.3--h21ec9f0_0"
# 処理するファイルを指定
file="COI.maffted.trimed.fa"
# ファイル名から拡張子を除いたベース名を取得
base_name=$(basename $file .maffted.trimed.fa)
# Singularityを使用してIQ-TREEを実行して系統樹を作成
singularity exec -e $SINGULARITY_IMAGE iqtree2 -s $file -nt AUTO -bb 1000 -cptime 600 -pre ${base_name}
# 作成された系統樹ファイル (.treefile) を確認
if [ -f ${base_name}.treefile ]; then
echo "Tree for ${base_name} has been successfully created."
else
echo "Error: ${base_name}.treefile not found" >&2
fi
echo "Process completed."
dateこれをqsubで投げた。
元データのヘッダーを書き換えた状態での系統樹推定
~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dirディレクトリから.fastaファイルを取得し、「種名」+「遺伝子ID」のみに書き換えて~/tools/for_orthofinder/RemakeHedder_6spディレクトリに保存するスクリプトedit.pyを~/tools/for_orthofinder/RemakeHedder_6spの下で作成。
### edit.pyの中身
import os
from Bio import SeqIO
# 入力ディレクトリと出力ディレクトリのパス
input_dir = '../Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/'
output_dir = '../RemakeHedder_6sp/'
# 出力ディレクトリが存在しない場合は作成
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# 入力ディレクトリ内のすべての .fasta ファイルを処理
for input_file in os.listdir(input_dir):
if input_file.endswith('.fasta'):
input_path = os.path.join(input_dir, input_file)
output_path = os.path.join(output_dir, input_file)
# 入力ファイルを読み込み、条件に基づいて書き換えた内容を出力ファイルに保存
with open(output_path, 'w') as outfile:
for record in SeqIO.parse(input_path, 'fasta'):
header = record.description
seq = str(record.seq)
# ヘッダーが「g」で始まる場合
if header.startswith("g"):
# 新しいヘッダーは「>Smad」 + 「元のヘッダーの番号」
number = header.split()[0] # ヘッダーの最初の番号部分を取得
new_header = f">Smad_{number}"
# ヘッダーが「]」で終わる場合
elif header.endswith("]"):
# ヘッダーの最後の「[]」内の英字を抽出
within_brackets = header.split('[')[-1].split(']')[0]
first_letter = within_brackets[0] # 最初の1文字
space_after = within_brackets.split()[-1][:3] # スペース後の3文字
# 元のヘッダーから最初の「>」の次の文字から最初の「 」までの部分を取得
first_part = header.split()[0][1:]
new_header = f">{first_letter}{space_after}_{first_part}"
else:
new_header = f">{header.split()[0]}"
# 新しいヘッダーと配列を出力ファイルに書き込む
outfile.write(f"{new_header}\n{seq}\n")
print(f"{output_path} に保存しました。")これを実行。
(MPT) kosukesano@at137:~/tools/for_orthofinder/RemakeHedder_6sp$ python edit.py
../RemakeHedder_6sp/Tcas.fasta に保存しました。
../RemakeHedder_6sp/Agra.fasta に保存しました。
../RemakeHedder_6sp/Smad.fasta に保存しました。
../RemakeHedder_6sp/Cass.fasta に保存しました。
../RemakeHedder_6sp/Dpon.fasta に保存しました。
../RemakeHedder_6sp/Sory.fasta に保存しました。
(MPT) kosukesano@at137:~/tools/for_orthofinder/RemakeHedder_6sp$ ls
Agra.fasta Cass.fasta Dpon.fasta Smad.fasta Sory.fasta Tcas.fasta edit.py~/tools/for_orthofinderでOrthofinderの実行スクリプトOrthofinder_240917_RH.shを記述。
### Orthofinder_240917_RH.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 5
#$ -l medium
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
singularity exec /usr/local/biotools/o/orthofinder:2.5.4--hdfd78af_0 orthofinder -f ~/tools/for_orthofinder/RemakeHedder_6sp -t 5 -a 5
dateこれをqsubで投げた。
Pissodes strobiのソフトマスク
生データは~/tools/for_softmask/nama_data/Pstr_dataに格納した。
(MPT) kosukesano@at137:~/tools/for_softmask/nama_data/Pstr_data$ ls
GCA_016904865.1 assembly_data_report.jsonl data_summary.tsv dataset_catalog.json
(MPT) kosukesano@at137:~/tools/for_softmask/nama_data/Pstr_data$ ソフトマスク用のディレクトリ~/tools/for_softmask/Pstr_softmaskを作成。
(MPT) kosukesano@at137:~/tools/for_softmask$ mkdir Pstr_softmask
(MPT) kosukesano@at137:~/tools/for_softmask$ cd Pstr_softmask/
(MPT) kosukesano@at137:~/tools/for_softmask/Pstr_softmask$ ls
(MPT) kosukesano@at137:~/tools/for_softmask/Pstr_softmask$ source ~/tools/pyenv_env/EDTA_profileデータベースの構築
(MPT) kosukesano@at137:~/tools/for_softmask/Pstr_softmask$ source ~/tools/pyenv_env/EDTA_profile
(EDTA2) kosukesano@at137:~/tools/for_softmask/Pstr_softmask$ BuildDatabase -name Pstr_BLAST_DATABASE_PREFIX /home/kosukesano/tools/for_softmask/nama_data/Pstr_data/GCA_016904865.1/GCA_016904865.1_GSC_weevil_1.0_genomic.fna
Building database Pstr_BLAST_DATABASE_PREFIX:
Reading /home/kosukesano/tools/for_softmask/nama_data/Pstr_data/GCA_016904865.1/GCA_016904865.1_GSC_weevil_1.0_genomic.fna...
Number of sequences (bp) added to database: 84140 ( 2025024129 bp )
(EDTA2) kosukesano@at137:~/tools/for_softmask/Pstr_softmask$ RepeatModelerの実行
Pstr_RepeatModeler.shを作成し、qsubで投げた。シェルスクリプトの中身は以下の通り。
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
RepeatModeler -database Pstr_BLAST_DATABASE_PREFIX -pa 6
date#### Elaeidobius kamerunicusのソフトマスク
生データは~/tools/for_softmask/nama_data/Ekam_dataに格納した。
ソフトマスク用のディレクトリ~/tools/for_softmask/Ekam_softmaskを作成。
データベースの構築
(EDTA2) kosukesano@at137:~/tools/for_softmask/Ekam_softmask$ BuildDatabase -name Ekam_BLAST_DATABASE_PREFIX /home/kosukesano/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna
Building database Ekam_BLAST_DATABASE_PREFIX:
Reading /home/kosukesano/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna...
Number of sequences (bp) added to database: 364527 ( 269635327 bp )
(EDTA2) kosukesano@at137:~/tools/for_softmask/Ekam_softmask$ RepeatModelerの実行
Ekam_RepeatModeler.shを作成し、qsubで投げた。シェルスクリプトの中身は以下の通り。
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
RepeatModeler -database Ekam_BLAST_DATABASE_PREFIX -pa 6
date0918
6種の昆虫ゲノムについて、all_seq.faのヘッダー行を書き換えて再度系統樹作成
OG番号から種ごとに配列を取ってくる際、何かやらかしているのでは?そこのケア。
~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/OrthoFinder/Results_Jun25/RenameHedderというディレクトリを作り、そこで作業を行う。
まず新しいOG_list.txtを作る。前のOG_list.txtは種名がわからなかったので種名もつけるようにする。それを実行するスクリプトはmakeOGlist.py。
### makeOGlist.pyの中身
# ファイルを読み込む
input_file = '../ManualPhylo_data/OG_list.txt'
output_file = '../RenameHedder/New_OG_list.txt'
# 種の接頭辞をリストで定義
prefixes = ['Agra_', 'Cass_', 'Dpon_', 'Smad_', 'Sory_', 'Tcas_']
# 出力ファイルに書き込み
with open(input_file, 'r') as fin, open(output_file, 'w') as fout:
for line in fin:
parts = line.strip().split() # 各行をスペースで区切る
new_line = parts[0] + ' ' + ' '.join([f'{prefixes[i]}{parts[i+1]}' for i in range(6)]) # プレフィックスを追加
fout.write(new_line + '\n') # 新しい行をファイルに書き込む最終出力はNew_OG_list.txt
次に新しいallseq.faを作る。こちらはヘッダーにいらないアノテーションの説明などが入っているので、遺伝子IDと種名のみにする。その実行スクリプトedit_allseq.pyは以下の通り。
### edit_allseq.pyの中身
GNU nano 6.2 edit_allseq.py
from Bio import SeqIO
# 入力ファイルと出力ファイルのパス
input_file = '../../../../make_philo_tree/output_directory/all_seq.fa'
output_file = 'new_all_seq.fa'
# 入力ファイルを読み込み、条件に基づいて書き換えた内容を出力ファイルに保存
with open(output_file, 'w') as outfile:
for record in SeqIO.parse(input_file, 'fasta'):
header = record.description
seq = str(record.seq)
# ヘッダーが「>g」で始まる場合
if header.startswith("g"):
# 新しいヘッダーは「>Smad」 + 「元のヘッダーの番号」
number = header.split()[0] # ヘッダーの最初の番号部分を取得
new_header = f">Smad_{number}"
# ヘッダーが「]」で終わる場合
elif header.endswith("]"):
# ヘッダーの最後の「[]」内の英字を抽出
within_brackets = header.split('[')[-1].split(']')[0]
first_letter = within_brackets[0] # 最初の1文字
space_after = within_brackets.split()[-1][:3] # スペース後の3文字
# 元のヘッダーから最初の「>」の次の文字から最初の「 」までの部分を取得
first_part = header.split()[0][0:]
new_header = f">{first_letter}{space_after}_{first_part}"
else:
new_header = f">{header.split()[0]}"
# 新しいヘッダーと配列を出力ファイルに書き込む
outfile.write(f"{new_header}\n{seq}\n")
print(f"{output_file} に保存しました。")これの最終出力はnew_all_seq.fa
続いて各OGごとに.fastaファイルを作る。その実行スクリプトNew_Manualphylo_2.pyは以下の通り。
### New_Manualphylo_2.pyの中身
# ライブラリのインポート
import os
# ファイルのパス
og_list_file = "New_OG_list.txt"
sequence_file = "new_all_seq.fa"
# sequence_fileをメモリに読み込む
with open(sequence_file, "r") as seq_file:
sequences = seq_file.read().splitlines()
# ヘッダーとシーケンスのマッピング
seq_dict = {}
header = ""
for line in sequences:
if line.startswith(">"):
header = line[1:].strip() # ヘッダー行('>'を除く)
seq_dict[header] = ""
else:
seq_dict[header] += line.strip()
# New_OG_list.txtを処理
with open(og_list_file, "r") as og_file:
og_lines = og_file.readlines()
# 1行ごとに処理を行う
for og_line in og_lines:
og_data = og_line.strip().split() # 空白で区切る
og_number = og_data[0] # OG番号
gene_ids = og_data[1:] # 遺伝子IDのリスト
# 出力ファイルを作成
output_file = f"{og_number}.fa"
with open(output_file, "w") as out_file:
# 各遺伝子IDについて処理
for gene_id in gene_ids:
if gene_id in seq_dict:
# 一致したヘッダーと配列データを書き込む
out_file.write(f">{gene_id}\n")
out_file.write(f"{seq_dict[gene_id]}\n")
# 経過報告
print(f"{og_number}.fa の作成が完了しました。")
print("すべての処理が完了しました。")続いて各OGのファイルにMAFFTをかける。そのスクリプトalign.shは以下の通り。
### align.shの中身
#!/bin/sh
#$ -S /bin/bash
#$ -cwd
#$ -v PATH
awk '{print($1)}' $1 | while read x; do #引数に前述のOG_list.txtなどのOGリストを指定する。
mafft --auto $x.fa > $x.maffted.fa
trimal -in $x.maffted.fa -out $x.maffted.trimed.fa -htmlout $x.maffted.trimed.fa.html -automated1
doneMAFFTによって得られたファイルをIQ-TREEの入力に沿うよう、遺伝子IDの部分を切る。そのスクリプトManualphylo_4.pyは以下の通り。
### Manualphylo_4.pyの中身
import os
# ファイルのヘッダーを変更する関数
def modify_headers(input_file, output_file):
with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
for line in infile:
if line.startswith(">"):
# ヘッダー行の最初の四文字を抽出して書き換え
outfile.write(f">{line[1:5]}\n")
else:
outfile.write(line)
# 作業ディレクトリ内のすべての ".maffted.trimed.fa" ファイルに対して処理を適用
def process_directory(directory):
for filename in os.listdir(directory):
if filename.endswith(".maffted.trimed.fa"):
input_file = os.path.join(directory, filename)
output_file = os.path.join(directory, filename.replace(".maffted.trimed.fa", ".maffted.trimed.edit.fa"))
modify_headers(input_file, output_file)
print(f"Processed: {filename}")
# 実行するディレクトリを指定(例:カレントディレクトリ)
process_directory(".")これを実行した後にmakealltree.shを作成、ジョブとして投げた。
ミトコンドリア系統樹続き
ジョブが帰ってきたので中身を見てみる。
### ~/tools/for_orthofinder/CO1_6sp/240918_test/COI.treefile
(Agra:0.0848824451,Cass:0.1343636812,((Dpon:0.0000020989,Tcas:8.4017171493)19:0.1500110432,(Smad:0.1408388478,Sory:0.1744199629)36:0.0439910356)25:0.0494856164);系統樹は以下の通り
tree10= read.tree("/Users/kosukesano/bio/240917_CO1.tre")
p=ggtree(tree10, branch.length = 'none')+
xlim(0, 5)+
theme(text = element_text(face = "italic"))+
geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
#geom_text(aes(label=node), hjust=-.2)+
theme_tree()
pなんだこれ……。
トリミングした後のファイルを見てみる。
### COI.maffted.trimed.faの中身
>Agra
------------------------------------------------------------
----------tttaattttaagaagaattgtagaaaaggagctggaacaggatgaacagt
ttaccctccactttcttctaatttagctcatgaaggacttctgttgatttagctattttt
agccttcatatagccgggatttcttcaattctcggagctataaattttatttcaacagta
aatatacctcaagaagtagagcaaatacctttattttgagctgtaaaaattacagctatc
ttattactaatttctcttccagtcttagcaggggctatta-ctatactactaactgaccg
taatattaatacatcattttttgatccgcaggtggtggagacccaattctctatcaacac
ttatttcccagaagtttatattctaat-------
>Cass
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------tccagaagtatatattttaataaaaaaa
>Dpon
------------------------------------------------------------
------------------------------------------------------------
------------------------------------------------------------
------------------gtatcctctatccttggagctatcaattttatttctacaata
aatatacttcaggaattagatcgtttaacgttattttgagcagtaaaaattacagctatc
ttattattgttatcattaccagtattagctggagccatca-ctatacttttaacagaccg
aaatatcaatactactttttttgantcyccggtggaggagatcctattctctatcaacac
ttatttccccgaagtttacattttaataaaaaag
>Smad
ataatacttcattaaggaaattttattctattgtaacagcatcttttttatgtaatacct
gtttaattcttcttattttaagaagaattatgataaaggagcaggaacaggctgaacagt
ttatccccctttatctacaaatattgctcatgaaggtcatctgtagatctagctatcttt
agactacatatagcagggatctcttcaatcctaggagcaataaattttatttcaacaatt
aatatactataggaatttgatcaattatcattattttgagcagttaaattaacagcaatt
ctacttttattatcattacctgttttagctggagctatca-ctatattattaactgatcg
aaatattaatacttcattttttgatccgcaggagggggagaccctatcttatatcaacat
ttattt----------------------------
>Sory
atagtacatccttaaggaaattttatactattgtcacagcattttctttatgtaatacca
atttaactcttttactaataagaagatttatgaaaagggagcaggaacaggatgaaccgt
ctaccccccgctctcatccaatattgcccatgaaggacttctgttgatctggccattttc
agtttacatatagcaggaatttcatctattctaggagctattaattttattacaacagat
aatatacctcaggaatctgaacgaataaccctattttgagcagtaagaatcactgctatt
ctcctcctctttagattacctgtattagcaggagcaatca-ctatacttcttactgatcg
aaatattaatacttccttttttgatccgccggaggag-----------------------
----------------------------------
>Tcas
aaaatacgccattacggcaaggttatacacttgacccagtttttccattaggaaatacca
gtttaaattttggaaattttaacaaaaccgtaaaatgaacaacgcacggcaatccttggc
tt-tggggcgcctccttgtgcgagaattcagcaaaacgcccccaggggcacggaacagcc
gggccccaggtttcacaagttgaaggagatccagaag-----aaatttggggttgaagac
gttccaacttgaaaggtcggcgacaaaattctctacttgactgtattgatcacagctg--
------------------caggtctcggcatgtccattgagtcgtggtaccgattggcca
ataaataaacataacttgttctgtagttactatagttaag-taaattccaacaataaaat
ttattt---------------------aaacaaaCassがスカスカすぎる!
7種昆虫でのヘッダー行再構築
上で書いた6種のやつからmakeOGlist.py、edit_allseq.py、New_Manualphylo_2.py、align.sh、Manualphylo_4.pyをコピーしてきて実行。
makealltree.shはseven_makealltree.shとしてコピー(同じ名前のジョブが並んでややこしくならないように。)
0919
7種ゲノムでヘッダー修正後の系統樹作成
結果は~/tools/for_ASTRAL/Astral/240919_7spに保存した。
tree10= read.tree("/Users/kosukesano/bio/240919_7sp")
p=ggtree(tree10, branch.length = 'none')+
xlim(0,7)+
theme(text = element_text(face = "italic"))+
geom_tiplab(fontface = 4, linesize=3.0) + # Make tip labels italic
geom_nodelab(hjust = -0.2, node = "internal", size = 5) +
#geom_text(aes(label=node), hjust=-.2)+
theme_tree()
p結局変わらなかった
7種でのSuperMatrix法によるIQ-TREEを使った系統樹推定
~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/ManualPhylo_dataでmakerun.pyを作成。
### makerun.pyの中身
import glob
import os
list = []
for i in glob.glob('*.maffted.trimed.edit.fa'):
list.append(os.path.split(i)[1].rstrip())
#print(list[0])
##ls | grep "maffted.trimed.edit.fa" > otamesi.txtで、完成したOGをotamesi.txtに一行ずつ保存
##ファイルの行数をカウント。このカウント数がfor文のrangeに入る数になる
f = open("run.nex", "w")
f.write("#nexus" + "\n")
f.write("begin sets;" + "\n")
character = "charset part"
for line, i in zip(list, range(4997)):
row = character + str(i+1) + " = " + line + ": ;"
f.write("\t" + row + "\n")
f.write("end;" + "\n")
f.close()これでrun.nexを作る。
続いてIQ-TREEの実行。使ったシェルスクリプトはmanualphylo.sh
### manualphylo.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
date
singularity exec -e /usr/local/biotools/i/iqtree:2.3.3--h21ec9f0_0 iqtree2 -sp run.nex -nt AUTO -bb 1000 -cptime 600
dateこれをqsubで投げた。
7種での欠失を1つ許したASTRAL
~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/Orthogroups下でSCOwiith0tyusyutu.pyを作成、実行した。SCOwiith0tyusyutu.pyは0905の完全コピー。
続いて~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/SCOwithOneZero_Manualphylo_dataでManualphylo_1.pyを実行した。Manualphylo_1.pyは以下の通り。
### Manualphylo_1.pyの中身
##analysis_manual.pptxの#46も参照
##AFTER you made MSA file(all_seq.fa) in DDBJ with makeMSA.sh
##時間は10secほど
import numpy as np
import pandas as pd
import os
path = "~/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir/OrthoFinder/Results_Sep03/"
withpath = "../"
OGs = pd.read_table(path + "Orthogroups/Orthogroups.tsv")
##with openは相対パスしか受け付けないらしい
new = pd.DataFrame()
with open(withpath + "Orthogroups/Orthogroups.GeneCount.SingleCopyWithOneZeroOrtholog.txt", "r") as fin:
for line in fin:
li = line.rstrip()
new = pd.concat([new, OGs[OGs["Orthogroup"] == li]])
print(new)
new.to_csv(path + "SCOwithOneZero_Manualphylo_data/OG_list.txt", sep = " ", index = False, header = False)
##OG_list.txtと同じ順番の種名リストであるspecies_list.txtを作成
##できたOG_list.txtに、DDBJで作ったall_seq.faで配列情報を与える。
li = []
allspe = OGs.columns.tolist()
allspe2 = allspe[1:len(allspe)]
with open(withpath + "SCOwithOneZero_Manualphylo_data/species_list.txt", "w") as file:
for column_name in allspe2:
file.write("%s\n" % column_name)同じディレクトリでManualphylo_2.pyを実行した。結構時間かかる。
### Manualphylo_2.pyの中身
##ManualPhylo_1.pyの続き
import sys
from Bio import SeqIO
path = "../SCOwithOneZero_Manualphylo_data/"
fasta_in = sys.argv[1] #1番目の引数には上記のall_seq.faなどfastaファイルを指定する
query_in = sys.argv[2] #2番目の引数には上記のOG_list.txtなどオーソログファイルを指定する
for q in open(query_in, "r"): #オーソログファイルを開いて1行づつ読み込む
query = q.split() #スペース毎に切りとってリスト形式でqueryに保存する
f = open(path + query[0], 'w') #最初の列(OG名)と同じ名前のファイルを作成する
for record in SeqIO.parse(fasta_in, 'fasta'): #fastaファイルを開くSeqIOを使ってパースする(1項目づつ読み込む)
id_part = record.id #fastaのID部分を読み込む
desc_part = record.description #fastaのdescription部分を読み込む
seq = record.seq #fastaの配列部分を読み込む
for i in range(len(query)): #オーソログファイル中の各OGに含まれる配列数を数えて、その分繰り返す(python2の人はrange を x rangeにする)
if id_part == query[i] or desc_part == query[i] : #オーソログファイルの配列descriptionとfastaの配列descriptionが一致したら、、、
fasta_seq = '>' + id_part + ' ' + desc_part + '\n' + seq + '\n' #fasta形式に整え
print(fasta_seq) #標準出力にfastaを出力(進行状況把握用)
f.write(str(fasta_seq)) #各OGファイルにfastaを出力
f.close()
##できたOGファイルは、align.shやOG_list.txtと同じ場所に
##align.shのある場所までいき、作動。cwdを231016/ManualPhylo_dataにしないとtrimalが作動せず、イライラ実行のコマンドは以下の通り
python ManualPhylo_2.py ../make_philo_tree/all_seq.fa OG_list.txt0930
Elaeidobius kamerunicusのRepeatMasker
~/tools/for_softmask/Ekam_softmaskで以下のスクリプトを作成、実行した。
### Ekam_RepeatMasker.sh
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
RepeatMasker -pa 6 -lib\
/home/kosukesano/tools/for_softmask/Ekam_softmask/RM_2331.MonSep230400512024\
/home/kosukesano/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna
datePissodes strobiのRepeatMasker
~/tools/for_softmask/Pstr_softmaskで以下のスクリプトを作成、実行した。
### Pstr_RepeatMasker.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 24
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
RepeatMasker -pa 6 -lib\
/home/kosukesano/tools/for_softmask/Pstr_softmask/RM_77685.SunSep222227102024/consensi.fa.classified\
/home/kosukesano/tools/for_softmask/nama_data/Pstr_data/GCA_016904865.1/GCA_016904865.1_GSC_weevil_1.0_genomic.fna
dateIQ-TREE出力の系統樹についてCAFEの前準備
:~/bio/for_cafe$ mkdir 0930_orthofinder_data
:~/bio/for_cafe$ cd 0930_orthofinder_data/
:~/bio/for_cafe/0930_orthofinder_data$ scp kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_orthofinder/RemakeHedder_6sp/OrthoFinder/Results_Sep19/Orthogroups/Orthogroups.GeneCount.tsv ../0930_orthofinder_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
Orthogroups.GeneCount.tsv 100% 354KB 4.7MB/s 00:00
:~/bio/for_cafe/0930_orthofinder_data$ ls
Orthogroups.GeneCount.tsv
:~/bio/for_cafe/0930_orthofinder_data$Orthologs_raw <-
read_tsv(paste("/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/Orthogroups.GeneCount.tsv", sep = "/"))
##Enzanはorthogroupのなかで遺伝子数が変なやつを検出するためのmatrix
Enzan <- Orthologs_raw %>%
select(!c(Orthogroup, Total)) %>%
t()
##saidai, saisyouは各Orthogroupの中で、各種が持っているコピー数の最大値及び最小値を記したdf
saidai <- Enzan %>%
apply(2, max) %>%
as.data.frame() %>%
rename(max_real = ".")
saisyou <- Enzan %>%
apply(2, min) %>%
as.data.frame() %>%
rename(min_real = ".")
##Orthologs_1は各Orthogroupsの最大値、最小値もくっつけたdf
Orthologs_1 <- Orthologs_raw %>% select(!c(Total)) %>%
bind_cols(saidai, saisyou)
##最大値と最小値の差
Orthologs_2 <-Orthologs_1 %>%
mutate(sa = max_real - min_real) %>%
filter(max_real != min_real) %>%
filter(sa < 50)
##外れ値と遺伝子ファミリー数が全種で共通の行を省いた。最後に1列目を複製し列名をいじって、CAFEへのインプットデータの出来上がり。
Orthologs_3 <- Orthologs_2 %>%
mutate(Description = Orthogroup, ID = Orthogroup) %>%
relocate(Description, ID) %>%
select(!c(Orthogroup, max_real, min_real, sa))
Orthologs_3 %>%
write_tsv(paste("/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/Orthogroups.GeneCount2.tsv", sep = "/"))#, quote = FALSE) #,row.names = FALSE)
##Did you finish creating ultrametric tree with makeultrametric.R?### ~/bio/for_cafe/0930_orthofinder_data/IQTREE_6sp_out.txt
(Agra:0.2063145278,((Cass:0.1854832267,Dpon:0.2312244082)97:0.0164557860,Smad:0.1400667500)100:0.0195008540,(Sory:0.2105736237,Tcas:0.5169602661)100:0.0679653321);tree = read.tree("/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/IQTREE_6sp_out.txt")
mrca = getMRCA(tree, tip=c('Tcas', 'Sory')) #分岐年代推定に使うノードの指定
tree2 = chronopl(
tree,
100000,
age.min = 152.3, # 推定分岐年代の最小値(MYA)
age.max = 236.2, # 推定分岐年代の最大値(MYA)
node = mrca, # getMRCAで指定したノード
S = 1,
tol = 1e-20,
CV = FALSE,
eval.max = 500,
iter.max = 500
)
is.ultrametric(tree2) # ultrametricかどうか確認
write.tree(tree2, file = "0930_orthofinder_data/tree_IQTREE_ultrametric.nwk") # ultrametric系統樹の保存:~/bio/for_cafe/0930_orthofinder_data$ scp Orthogroups.GeneCount2.tsv kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_cafe/6sp_useIQTREE
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
Orthogroups.GeneCount2.tsv 100% 400KB 5.9MB/s 00:00
:~/bio/for_cafe/0930_orthofinder_data$ scp tree_IQTREE_ultrametric.nwk kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_cafe/6sp_useIQTREE
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
tree_IQTREE_ultrametric.nwk 100% 143 14.8KB/s 00:00
:~/bio/for_cafe/0930_orthofinder_data$IQ-TREE出力の系統樹を使ったCAFE5の実行
kosukesano@at138:~/tools/for_cafe/6sp_useIQTREE$ singularity exec -e /usr/local/biotools/c/cafe:5.0.0--h5b5514e_2 cafe5 -i Orthogroups.GeneCount2.tsv -t tree_IQTREE_ultrametric.nwk
Command line: /usr/local/bin/cafe5 -i Orthogroups.GeneCount2.tsv -t tree_IQTREE_ultrametric.nwk
Filtering families not present at the root from: 12784 to 8360
No root family size distribution specified, using uniform distribution
Optimizer strategy: Nelder-Mead with similarity cutoff
Iterations: 300
Expansion: 2
Reflection: 1
.
.
.
.
.
59 values were attempted (0% rejected)
Inferring processes for Base model
Score (-lnL): 115874.33817573
Maximum possible lambda for this topology: 0.0020213044143122
Computing pvalues...
done!
Starting reconstruction processes for Base model
Done!
kosukesano@at138:~/tools/for_cafe/6sp_useIQTREE$ ls
Orthogroups.GeneCount2.tsv results tree_IQTREE_ultrametric.nwk
kosukesano@at138:~/tools/for_cafe/6sp_useIQTREE$ cd results/
kosukesano@at138:~/tools/for_cafe/6sp_useIQTREE/results$ ls
Base_asr.tre Base_branch_probabilities.tab Base_change.tab Base_clade_results.txt Base_count.tab Base_family_likelihoods.txt Base_family_results.txt Base_results.txt
kosukesano@at138:~/tools/for_cafe/6sp_useIQTREE/results$ ### ~/tools/for_cafe/6sp_useIQTREE/results/Base_clade_results.txt
#Taxon_ID Increase Decrease
Agra<8> 2793 925
<7> 1 53
<5> 1 12
Cass<1> 499 3639
Dpon<0> 2982 1374
Smad<4> 886 3180
<6> 861 573
Sory<3> 2191 1232
Tcas<2> 1853 1984ASTRALの再実行
6種のゲノム全てを使ったASTRALを改めて行った。結果は~/tools/for_ASTRAL/Astral/240930_6spに格納。使ったスクリプトは以下の通り。
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 16
echo start at
date
java -Xmx2G -jar astral.5.7.8.jar \
-i /home/kosukesano/tools/for_ASTRAL/Astral/data/240930_6sp/modified_trees.nwk \
-o /home/kosukesano/tools/for_ASTRAL/Astral/240930_6sp/out.tre \
2>/home/kosukesano/tools/for_ASTRAL/Astral/240930_6sp/out.log
dateこの出力は末端枝の長さが不明なので、暫定的に末端枝を1とするスクリプトを用意し実行した。スクリプトのソースはこちら
### ~/tools/for_ASTRAL/Astral/240930_6sp/makelength.pyの中身
#!/usr/bin/env python
'''
Created on Jun 3, 2011
@author: smirarab
'''
import dendropy
import sys
import os
import copy
import os.path
if __name__ == '__main__':
if len(sys.argv) < 3:
print("USAGE: [postfix|-|--] treefile")
sys.exit(1)
stdout = False
if sys.argv[1] == "-":
resultsFile = sys.stdout
stdout = True
elif sys.argv[1] == "--":
postfix = "blen"
else:
postfix = sys.argv[1]
c={}
for treeName in sys.argv[2:]:
if not stdout:
resultsFile=open("%s.%s" % (treeName, postfix),'w')
trees = dendropy.TreeList.get_from_path(treeName, 'newick')
for tree in trees:
for e in tree.postorder_edge_iter():
if not e.length:
e.length = 1
sys.stderr.write("writing results to " + resultsFile.name + "\n")
trees.write(file=resultsFile,schema='newick')これを以下のコマンドで実行。
kosukesano@at138:~/tools/for_ASTRAL/Astral/240930_6sp$ python makelength.py blen out.tre
writing results to out.tre.blen
kosukesano@at138:~/tools/for_ASTRAL/Astral/240930_6sp$これで書きかわったファイルout.tre.blenができた。
このファイルをローカルで/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/ASTRAL_6sp.txtとして保存し、以下のコードを実行した。
tree = read.tree("/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/ASTRAL_6sp.txt")
ASTRAL_6sp = root(tree, outgroup = "Tcas")
write.tree(phy=ASTRAL_6sp, file='0930_orthofinder_data/ASTRAL_6sp_after_root_outgroup.txt')
mrca = getMRCA(ASTRAL_6sp, tip=c('Tcas', 'Sory')) #分岐年代推定に使うノードの指定
tree2 = chronopl(
tree,
100000,
age.min = 152.3, # 推定分岐年代の最小値(MYA)
age.max = 236.2, # 推定分岐年代の最大値(MYA)
node = mrca, # getMRCAで指定したノード
S = 1,
tol = 1e-20,
CV = FALSE,
eval.max = 500,
iter.max = 500
)
is.ultrametric(tree2) # ultrametricかどうか確認
write.tree(tree2, file = "0930_orthofinder_data/tree_ASTRAL_ultrametric.nwk") # ultrametric系統樹の保存これによってできた系統樹を~/tools/for_cafe/6sp_useASTRALにコピーした。
ASTRAL出力の系統樹を使ったCAFE5の実行
kosukesano@at138:~/tools/for_cafe/6sp_useASTRAL$ singularity exec -e /usr/local/biotools/c/cafe:5.0.0--h5b5514e_2 cafe5 -i ../6sp_useIQTREE/Orthogroups.GeneCount2.tsv -t tree_ASTRAL_u
ltrametric.nwk
Command line: /usr/local/bin/cafe5 -i ../6sp_useIQTREE/Orthogroups.GeneCount2.tsv -t tree_ASTRAL_ultrametric.nwk
Filtering families not present at the root from: 12784 to 8975
No root family size distribution specified, using uniform distribution
Optimizer strategy: Nelder-Mead with similarity cutoff
Iterations: 300
Expansion: 2
Reflection: 1
.
.
.
.
.
Completed 24 iterations
Time: 0H 0M 3S
Best match is: 0.0023045008755977
Final -lnL: 126872.43351184
51 values were attempted (0% rejected)
Inferring processes for Base model
Score (-lnL): 126872.43351184
Maximum possible lambda for this topology: 0.004233700254022
Computing pvalues...
done!
Starting reconstruction processes for Base model
Done!
kosukesano@at138:~/tools/for_cafe/6sp_useASTRAL$ ### ~/tools/for_cafe/6sp_useASTRAL/results/Base_clade_results.txt
#Taxon_ID Increase Decrease
Cass<0> 592 3547
Agra<9> 3187 914
<8> 27 1188
<7> 710 754
Sory<5> 2319 1113
Tcas<4> 1923 1958
<6> 1 194
Dpon<3> 3181 1428
Smad<1> 1013 3123IQ-TREE出力の系統樹を使ったPAMLの実行
~/tools/for_paml/IQTREE_6sp/ディレクトリを作成、その下でdataディレクトリを作った。dataにはツリーファイルを格納した。
kosukesano@at138:~/tools/for_paml/IQTREE_6sp$ ls data
tree_ASTRAL_ultrametric.nwk tree_IQTREE_ultrametric.nwkまた、~/tools/for_paml/IQTREE_6sp/bsAディレクトリを作成。その下でrun_paml.shとtemplate.ctlを作った。
### run_paml.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/6sp/data/SCO_plusname"
bsA_dir="/home/kosukesano/tools/for_paml/IQTREE_6sp/bsA"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/template.ctl"
# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"
# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")
# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fasta; do
if [[ -f "$file" ]]; then
base_name=$(basename "$file" .fasta)
outfile_path="$result_dir/${base_name}_branch_alt"
# 一時的な制御ファイルの内容を生成
ctl_content="${ctl_template//<SEQFILE>/$file}"
ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"
# 一時的な制御ファイルを作成
ctl_path="$bsA_dir/bsA.ctl"
echo "$ctl_content" > "$ctl_path"
# PAMLを実行
singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"
echo "Processed file: $file, output: $outfile_path"
fi
done###template.ctlの中身
seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/IQTREE_6sp/data/tree_IQTREE_ultrametric.nwk
outfile = <OUTFILE>
noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
model = 2
NSsites = 2
fix_omega = 0
omega = 1
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0run_paml.shをqsubで投げた。
続いて帰無仮説の方について、~/tools/for_paml/IQTREE_6sp/bs_nullディレクトリを作成。その下でbsN_run_paml.shとbsN_template.ctlを作った。
ASTRAL出力の系統樹を使ったPAMLの実行
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ ls
bsA bs_null data rst rst1 rub run_paml.sh.e26903588 run_paml.sh.o26903588
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ ls bsA/
2NG.dN 2NG.dS 2NG.t 4fold.nuc bsA.ctl lnf result rst rst1 rub run_paml.sh run_paml.sh.e26903683 run_paml.sh.o26903683 template.ctl
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ ls bs_null/
2NG.dN 2NG.t bsA.ctl bsN_run_paml.sh.e26903612 bsN_run_paml.sh.o26903612 bsN_template.ctl result rst1
2NG.dS 4fold.nuc bsN_run_paml.sh bsN_run_paml.sh.e26903688 bsN_run_paml.sh.o26903688 lnf rst rub
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ 2024年10月
1004
scorpion内でpyenvを立ち上げる。
pyenvのインストール
(base) dendezia@scorpion:~$ git clone https://github.com/yyuu/pyenv.git ~/.pyenv
Cloning into '/home/dendezia/.pyenv'...
remote: Enumerating objects: 25118, done.
remote: Counting objects: 100% (1852/1852), done.
remote: Compressing objects: 100% (127/127), done.
remote: Total 25118 (delta 1772), reused 1732 (delta 1724), pack-reused 23266 (from 1)
Receiving objects: 100% (25118/25118), 5.09 MiB | 9.08 MiB/s, done.
Resolving deltas: 100% (16935/16935), done.
(base) dendezia@scorpion:~$ ls -a
. .. .R .bash_history .bash_logout .bashrc .cache .conda .dotnet .lesshst .profile .pyenv .ssh .vscode-server .wget-hsts .zshrc old_envilonment_until20241004
(base) dendezia@scorpion:~$ ls .pyenv/
CHANGELOG.md CONDUCT.md Dockerfile MAINTENANCE.md README.md completions man pyenv.d terminal_output.png
COMMANDS.md CONTRIBUTING.md LICENSE Makefile bin libexec plugins src test
(base) dendezia@scorpion:~$ pyenv用のプロファイルの作成
(base) dendezia@scorpion:~$ mkdir pyenv_conda_environment
(base) dendezia@scorpion:~$ cd pyenv_conda_environment/
(base) dendezia@scorpion:~/pyenv_conda_environment$ nano .pyenv_profile
(base) dendezia@scorpion:~/pyenv_conda_environment$ ls -a
. .. .pyenv_profile
(base) dendezia@scorpion:~/pyenv_conda_environment$ source .pyenv_profile
(base) dendezia@scorpion:~/pyenv_conda_environment$ pyenv
pyenv 2.4.14-1-g468dc811
Usage: pyenv <command> [<args>]
Some useful pyenv commands are:
--version Display the version of pyenv
commands List all available pyenv commands
exec Run an executable with the selected Python version
global Set or show the global Python version(s)
help Display help for a command
hooks List hook scripts for a given pyenv command
init Configure the shell environment for pyenv
install Install a Python version using python-build
latest Print the latest installed or known version with the given prefix
local Set or show the local application-specific Python version(s)
prefix Display prefixes for Python versions
rehash Rehash pyenv shims (run this after installing executables)
root Display the root directory where versions and shims are kept
shell Set or show the shell-specific Python version
shims List existing pyenv shims
uninstall Uninstall Python versions
version Show the current Python version(s) and its origin
version-file Detect the file that sets the current pyenv version
version-name Show the current Python version
version-origin Explain how the current Python version is set
versions List all Python versions available to pyenv
whence List all Python versions that contain the given executable
which Display the full path to an executable
See `pyenv help <command>' for information on a specific command.
For full documentation, see: https://github.com/pyenv/pyenv#readme
(base) dendezia@scorpion:~/pyenv_conda_environment$ pyenvを用いたAnaconda3のインストール
### condainstall.shの中身
#$ -S /bin/bash
#$ -cwd
date
echo starting at date
source ~/pyenv_conda_environment/.pyenv_profile
pyenv install anaconda3-2020.11
dateこれをqsubで投げた
結果
~/.pyenv/versions/にanaconda3-2020.11ディレクトリができた。
(base) dendezia@scorpion:~/pyenv_conda_environment$ cd ~/.pyenv/versions/
(base) dendezia@scorpion:~/.pyenv/versions$ ls
anaconda3-2020.11
(base) dendezia@scorpion:~/.pyenv/versions$ ls anaconda3-2020.11/
LICENSE.txt compiler_compat condabin envs include lib man phrasebooks plugins resources share ssl var
bin conda-meta doc etc info libexec mkspecs pkgs qml sbin shell translations x86_64-conda_cos6-linux-gnu
(base) dendezia@scorpion:~/.pyenv/versions$BRAKER3用のAnaconda環境であるbrakerを作成
base) dendezia@scorpion:~/tool/pyenv_env$ conda create -n braker python=3.9
Collecting package metadata (current_repodata.json): done
Solving environment: done
==> WARNING: A newer version of conda exists. <==
current version: 4.9.2
latest version: 24.9.1
Please update conda by running
.
.
.braker環境用のプロファイル、braker_profileを~/tool/pyenv_envの下に作成。
### braker_profileの中身
source ~/.bash_profile
source ~/pyenv_conda_environment/.pyenv_profile
pyenv global anaconda3-2020.11
# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/home/dendezia/.pyenv/versions/anaconda3-2020.11/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
eval "$__conda_setup"
else
if [ -f "/home/dendezia/.pyenv/versions/anaconda3-2020.11/etc/profile.d/conda.sh" ]; then
. "/home/dendezia/.pyenv/versions/anaconda3-2020.11/etc/profile.d/conda.sh"
else
export PATH="/home/dendezia/.pyenv/versions/anaconda3-2020.11/bin:$PATH"
fi
fi
unset __conda_setup
# <<< conda initialize <<<
conda activate brakerまた、scorpion内に.bash_profileがなかったのでそれも作成。
### .bash_profileの中身
# .bash_profile
# Get the aliases and functions
if [ -f ~/.bashrc ]; then
. ~/.bashrc
fi
# User specific environment and startup programs
PATH=$PATH:$HOME/.local/bin:$HOME/bin
export PATHここまでやってbraker_profileをsourceするとbraker環境に入れる。
ただ。現状ではまだBRAKERは入っておらず、空の環境があるだけ。
1007
IQ-TREEを使ったCAFE5の結果の処理
ローカルの~/bioにCAFE5のresultのディレクトリをコピーした。
:~/bio/for_cafe/241007_cafe_original_data$ scp -r kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_cafe/6sp_useIQTREE/results ~/bio/for_cafe/241007_cafe_original_data/useIQTREE
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
Base_clade_results.txt 100% 163 5.0KB/s 00:00
Base_asr.tre 100% 1422KB 10.7MB/s 00:00
Base_count.tab 100% 245KB 2.6MB/s 00:00
Base_results.txt 100% 163 5.1KB/s 00:00
Base_family_likelihoods.txt 100% 154KB 1.2MB/s 00:00
Base_family_results.txt 100% 146KB 2.3MB/s 00:00
Base_branch_probabilities.tab 100% 72KB 1.9MB/s 00:00
Base_change.tab 100% 327KB 5.1MB/s 00:00
:~/bio/for_cafe/241007_cafe_original_data$ ls
useIQTREE同じくOrthofinderの出力もコピーした。
:~/bio/for_cafe/0930_orthofinder_data$ scp -r kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_orthofinder/RemakeHedder_6sp/OrthoFinder/Results_Sep19/Orthogroups/Orthogroups.tsv ~/bio/for_cafe/0930_orthofinder_data/
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
Orthogroups.tsv 100% 2350KB 15.7MB/s 00:00
:~/bio/for_cafe/0930_orthofinder_data$ ls
ASTRAL_6sp.txt IQTREE_6sp_out.txt Orthogroups.GeneCount2.tsv tree_ASTRAL_ultrametric.nwk
ASTRAL_6sp_after_root_outgroup.txt Orthogroups.GeneCount.tsv Orthogroups.tsv tree_IQTREE_ultrametric.nwk
:~/bio/for_cafe/0930_orthofinder_data$ これを元に以下のスクリプトを実行した。
library(tidyverse)
Deg<-read.csv("Deg/DEG_ovary_vs_body_DESeq2.csv", sep=",")
Plami<-read.csv("241007_cafe_original_data/useIQTREE/Base_change.tab", sep="\t")
View(Plami)
# ファイルを読み込む
file_path <- "241007_cafe_original_data/useIQTREE/Base_asr.tre"
lines <- readLines(file_path)# linesはCAFEが推定した各遺伝子ファミリーの系統樹。有意な増減があったところには*がふられている。
print(lines)
# TREESセクションのみを抽出する
trees_start <- which(grepl("BEGIN TREES;", lines))
trees_end <- which(grepl("END;", lines))
trees_lines <- lines[(trees_start + 1):(trees_end - 1)]
print(trees_lines)
# 不要なスペースを削除
trees_lines <- gsub("^\\s+|\\s+$", "", trees_lines)
# データフレームに変換
library(tibble)
trees_df <- tibble(Tree = trees_lines)
ex=trees_df|>###マダラで優位に増減したOGのOG番号を抽出したファイル
#lines|>
tidyr::separate(Tree, into = c("OG_num", "tree"), sep = r"(\s=\s)")|>#系統樹の文字列をOG番号の列とツリーの列に分割
dplyr::mutate(OG_num = stringr::str_extract(OG_num, "OG\\d+")) |>#OG番号の列の余計な文字を除去
dplyr::mutate(tree = stringr::str_extract(tree, "Smad<4>\\*_")) |>#有意な差のある遺伝子ファミリー(*がついてるやつ)のみを抽出
dplyr::mutate(tree = stringr::str_replace(tree, "Smad<4>\\*_", "significant")) |>#わかりやすいようにsignificantに変更
dplyr::filter(tree == "significant") |>#significantのみを抽出
print()
View(ex)
#################################################################
Plami2=Plami |>###マダラで増加した0Gの0G番号を抽出したファイル
dplyr::select("FamilyID","Smad.4.") |>#OG番号の列とマダラでの遺伝子数の増減量が書いてある列のみを抽出
dplyr::mutate(Smad.4. = stringr::str_extract(Smad.4., r"(^\d+)")) |>#マダラの遺伝子量増減の列のうち、数字のみのもの(-がついておらず、遺伝子数が増加しているもの)を抽出
tidyr::drop_na()|>
dplyr::filter(Smad.4. != 0) |>#遺伝子数の増加分が0のものを除去
print()
View(Plami2)
#################################################################
df=dplyr::inner_join(Plami2, ex, by = c(FamilyID = "OG_num"))|>###マダラで優位に増加したOGのOG番号を抽出したファイル
print()
##################################################################
# ファイルパスの指定
orthogroups_file <- "0930_orthofinder_data/Orthogroups.tsv"
# Orthogroups.tsvの読み込み
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
read.delim(orthogroups_file, header=FALSE, sep="\t",
#stringsAsFactors=FALSE,
#col.names = "Data"
skip=1
)|>
dplyr::select("V1", "V5")
# データの最初の数行を表示して確認
head(orthogroups)
View(orthogroups)
df2=dplyr::left_join(df, orthogroups, by = c(FamilyID = "V1"))|>#マダラで有意に増加したOGのOG番号とマダラの遺伝子IDを紐付ける
dplyr::select(!c(Smad.4., tree)) |>
print()
View(df2)
################################################################
# V5列の遺伝子IDをカンマで区切って、新しいデータフレームを作成
df_expanded <- df2 %>%###マダラでのみ増加した遺伝子のgene_IDとOG番号
separate_rows(V5, sep = ", ") %>%
rename(gene_ID = V5, family_ID = FamilyID)|>
dplyr::mutate(gene_ID = stringr::str_replace(gene_ID, "^Smad_", "")) |>
print()
###############################################################
### CAFE5でマダラでのみ増加した遺伝子とその機能のファイル、df3
fa<-read.csv("/Users/kosukesano/bio/functional_annotation/merged_with_gene_function.csv", sep=",")
View(fa)
df3=dplyr::left_join(df_expanded, fa, by = c(gene_ID = "Madara"))|>###完成系
print()
View(df3)結果として、104個の遺伝子ファミリー、584個の遺伝子が、有意に増加した遺伝子として検出された。
ASTRALを使ったCAFE5の結果の処理
ローカルの~/bioに同じくCAFE5のresultディレクトリをコピーした。
:~/bio/for_cafe/0930_orthofinder_data$ scp -r kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_cafe/6sp_useASTRAL/results ~/bio/for_cafe/241007_cafe_original_data/useASTRAL
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
Base_clade_results.txt 100% 168 5.0KB/s 00:00
Base_asr.tre 100% 1702KB 10.7MB/s 00:00
Base_count.tab 100% 281KB 3.8MB/s 00:00
Base_results.txt 100% 162 4.6KB/s 00:00
Base_family_likelihoods.txt 100% 165KB 2.5MB/s 00:00
Base_family_results.txt 100% 157KB 2.3MB/s 00:00
Base_branch_probabilities.tab 100% 73KB 937.3KB/s 00:00
Base_change.tab 100% 377KB 4.8MB/s 00:00
:~/bio/for_cafe/0930_orthofinder_data$ これを元に以下のスクリプトを実行した。
################################################################################
### ASTRALを使用
A_df1=read.csv("241007_cafe_original_data/useASTRAL/Base_change.tab", sep="\t")
print(A_df1)
A_lines=readLines("241007_cafe_original_data/useASTRAL/Base_asr.tre")
# TREESセクションのみを抽出する
trees_start <- which(grepl("BEGIN TREES;", A_lines))
trees_end <- which(grepl("END;", A_lines))
trees_lines <- lines[(trees_start + 1):(trees_end - 1)]
trees_lines <- gsub("^\\s+|\\s+$", "", trees_lines)
print(trees_lines)
# データフレームに変換
trees_df <- tibble(Tree = trees_lines)
ex=trees_df|>###マダラで優位に増減したOGのOG番号を抽出したファイル
#lines|>
tidyr::separate(Tree, into = c("OG_num", "tree"), sep = r"(\s=\s)")|>#系統樹の文字列をOG番号の列とツリーの列に分割
dplyr::mutate(OG_num = stringr::str_extract(OG_num, "OG\\d+")) |>#OG番号の列の余計な文字を除去
dplyr::mutate(tree = stringr::str_extract(tree, "Smad<4>\\*_")) |>#有意な差のある遺伝子ファミリー(*がついてるやつ)のみを抽出
dplyr::mutate(tree = stringr::str_replace(tree, "Smad<4>\\*_", "significant")) |>#わかりやすいようにsignificantに変更
dplyr::filter(tree == "significant") |>#significantのみを抽出
print()
View(ex)
#################################################################
A_df2=A_df1 |>###マダラで増加した0Gの0G番号を抽出したファイル
dplyr::select("FamilyID","Smad.1.") |>#OG番号の列とマダラでの遺伝子数の増減量が書いてある列のみを抽出
dplyr::mutate(Smad.1. = stringr::str_extract(Smad.1., r"(^\d+)")) |>#マダラの遺伝子量増減の列のうち、数字のみのもの(-がついておらず、遺伝子数が増加しているもの)を抽出
tidyr::drop_na()|>
dplyr::filter(Smad.1. != 0) |>#遺伝子数の増加分が0のものを除去
print()
View(Plami2)
#################################################################
A_df3=dplyr::inner_join(A_df2, ex, by = c(FamilyID = "OG_num"))|>###マダラで優位に増加したOGのOG番号を抽出したファイル
print()
##################################################################
# ファイルパスの指定
orthogroups_file <- "0930_orthofinder_data/Orthogroups.tsv"
# Orthogroups.tsvの読み込み
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
read.delim(orthogroups_file, header=FALSE, sep="\t",
#stringsAsFactors=FALSE,
#col.names = "Data"
skip=1
)|>
dplyr::select("V1", "V5")
# データの最初の数行を表示して確認
head(orthogroups)
View(orthogroups)
A_df4=dplyr::left_join(A_df3, orthogroups, by = c(FamilyID = "V1"))|>#マダラで有意に増加したOGのOG番号とマダラの遺伝子IDを紐付ける
dplyr::select(!c(Smad.1., tree)) |>
print()
View(A_df4)
################################################################
# V5列の遺伝子IDをカンマで区切って、新しいデータフレームを作成
A_df5 <- A_df4 %>%###マダラでのみ増加した遺伝子のgene_IDとOG番号
separate_rows(V5, sep = ", ") %>%
rename(gene_ID = V5, family_ID = FamilyID)|>
dplyr::mutate(gene_ID = stringr::str_replace(gene_ID, "^Smad_", "")) |>
print()
###############################################################
### CAFE5でマダラでのみ増加した遺伝子とその機能のファイル、df3
fa<-read.csv("/Users/kosukesano/bio/functional_annotation/merged_with_gene_function.csv", sep=",")
View(fa)
A_df6=dplyr::left_join(A_df5, fa, by = c(gene_ID = "Madara"))|>###完成系
print()
View(A_df6)
################################################################まとめ
IQTREE系統樹を使ったCAFE5にて、マダラで有意に増加した遺伝子ファミリーに含まれる遺伝子
# A tibble: 584 × 10
family_ID gene_ID Ecoli Ecol_GeneFunction Dmelanogaster Dmel_GeneFunction
<chr> <chr> <chr> <chr> <chr> <chr>
1 OG0000006 g10906.t1 "" "" "" ""
2 OG0000006 g11758.t1 "" "" "" ""
3 OG0000006 g12212.t1 "" "" "" ""
4 OG0000006 g12251.t1 "" "" "" ""
5 OG0000006 g12781.t1 "" "" "" ""
6 OG0000006 g13547.t1 "" "" "" ""
7 OG0000006 g1616.t1 "" "" "" ""
8 OG0000006 g2495.t1 "" "" "" ""
9 OG0000006 g3400.t1 "" "" "" ""
10 OG0000006 g3400.t2 "" "" "" ""
# ℹ 574 more rows
# ℹ 4 more variables: Tcastaneum <chr>, Tcas_GeneFunction <chr>, Soryzae <chr>,
# Sory_GeneFunction <chr>
ASTRAL系統樹を使ったCAFE5にて、マダラで有意に増加した遺伝子ファミリーに含まれる遺伝子
# A tibble: 584 × 10
family_ID gene_ID Ecoli Ecol_GeneFunction Dmelanogaster Dmel_GeneFunction
<chr> <chr> <chr> <chr> <chr> <chr>
1 OG0000006 g10906.t1 "" "" "" ""
2 OG0000006 g11758.t1 "" "" "" ""
3 OG0000006 g12212.t1 "" "" "" ""
4 OG0000006 g12251.t1 "" "" "" ""
5 OG0000006 g12781.t1 "" "" "" ""
6 OG0000006 g13547.t1 "" "" "" ""
7 OG0000006 g1616.t1 "" "" "" ""
8 OG0000006 g2495.t1 "" "" "" ""
9 OG0000006 g3400.t1 "" "" "" ""
10 OG0000006 g3400.t2 "" "" "" ""
# ℹ 574 more rows
# ℹ 4 more variables: Tcastaneum <chr>, Tcas_GeneFunction <chr>, Soryzae <chr>,
# Sory_GeneFunction <chr>
CAFE5全体での結果は以下の通り
### useIQTREE/Base_clade_results.txt
#Taxon_ID Increase Decrease
Agra<8> 2793 925
<7> 1 53
<5> 1 12
Cass<1> 499 3639
Dpon<0> 2982 1374
Smad<4> 886 3180
<6> 861 573
Sory<3> 2191 1232
Tcas<2> 1853 1984### useASTRAL/Base_clade_results.txt
#Taxon_ID Increase Decrease
Cass<0> 592 3547
Agra<9> 3187 914
<8> 27 1188
<7> 710 754
Sory<5> 2319 1113
Tcas<4> 1923 1958
<6> 1 194
Dpon<3> 3181 1428
Smad<1> 1013 3123牧野研wiki用のパスワード生成
:~$ echo -n '(ここにパスワードを入れる)' | shasum -a 256scorpion内にmambaforge/EDTA環境を立ち上げる
dendezia@scorpion:~/pyenv_conda_environment$ pyenv install mambaforge-22.9.0-3
Downloading Mambaforge-22.9.0-3-Linux-x86_64.sh.sh...
-> https://github.com/conda-forge/miniforge/releases/download/22.9.0-3/Mambaforge-22.9.0-3-Linux-x86_64.sh
Installing Mambaforge-22.9.0-3-Linux-x86_64.sh...
Collecting package metadata (current_repodata.json): done
Solving environment: done
==> WARNING: A newer version of conda exists. <==
current version: 22.9.0
latest version: 24.9.1
Please update conda by running
$ conda update -n base -c conda-forge conda
## Package Plan ##
environment location: /home/dendezia/.pyenv/versions/mambaforge-22.9.0-3
added / updated specs:
- conda=22.9.0
- pip
The following packages will be downloaded:
package | build
---------------------------|-----------------
ca-certificates-2024.8.30 | hbcca054_0 155 KB conda-forge
certifi-2024.8.30 | pyhd8ed1ab_0 160 KB conda-forge
pip-24.2 | pyh8b19718_1 1.2 MB conda-forge
------------------------------------------------------------
Total: 1.5 MB
The following packages will be UPDATED:
ca-certificates 2022.12.7-ha878542_0 --> 2024.8.30-hbcca054_0 None
certifi 2022.12.7-pyhd8ed1ab_0 --> 2024.8.30-pyhd8ed1ab_0 None
pip 22.3.1-pyhd8ed1ab_0 --> 24.2-pyh8b19718_1 None
Downloading and Extracting Packages
ca-certificates-2024 | 155 KB | ############################################################################################################################################ | 100%
pip-24.2 | 1.2 MB | ############################################################################################################################################ | 100%
certifi-2024.8.30 | 160 KB | ############################################################################################################################################ | 100%
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Retrieving notices: ...working... done
Installed Mambaforge-22.9.0-3-Linux-x86_64.sh to /home/dendezia/.pyenv/versions/mambaforge-22.9.0-3
dendezia@scorpion:~/pyenv_conda_environment$ これでmambaforgeのインストールは完了。
次に~/tool/pyenv_envでEDTA_profileを作成する。EDTA_profileの中身は以下の通り。
### EDTA_profileの中身
source ~/.bash_profile
source ~/pyenv_conda_environment/.pyenv_profile
pyenv global mambaforge-22.9.0-3
# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
eval "$__conda_setup"
else
if [ -f "/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3//etc/profile.d/conda.sh" ]; then
. "/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/conda.sh"
else
export PATH="/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/bin:$PATH"
fi
fi
unset __conda_setup
if [ -f "/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh" ]; then
. "/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh"
fi
# <<< conda initialize <<<
conda activate EDTA2これをそのままsourceするとmambaforgeの環境には入れるが、EDTAの環境が無いためbaseに入る。
その状態で下のコードを実行する。
(base) dendezia@scorpion:~/tool/pyenv_env$ git clone https://github.com/oushujun/EDTA.git
Cloning into 'EDTA'...
remote: Enumerating objects: 4879, done.
remote: Counting objects: 100% (626/626), done.
remote: Compressing objects: 100% (168/168), done.
remote: Total 4879 (delta 479), reused 580 (delta 455), pack-reused 4253 (from 1)
Receiving objects: 100% (4879/4879), 232.57 MiB | 17.86 MiB/s, done.
Resolving deltas: 100% (2769/2769), done.
Updating files: 100% (222/222), done.
(base) dendezia@scorpion:~/tool/pyenv_env$ ls
EDTA EDTA_profile braker_profileEDTAのgitにはmambaのイメージが置いてあるので、それを利用する。
EDTAのディレクトリに入り、以下のコマンドを実行する。
mamba env create -f EDTA_2.2.x.ymlこれによりEDTA2というmambaの環境が立ち上がる。あとはEDTA_profileをsourceすればEDTAの環境に入れる。
(base) dendezia@scorpion:~/tool/pyenv_env$ source EDTA_profile
(EDTA2) dendezia@scorpion:~/tool/pyenv_env$ scorpionでのEcanのソフトマスク
scorpionにECanのゲノムデータを転送。
:~/Downloads$ scp Ekam_ncbi_dataset.zip dendezia@scorpion:/home/dendezia/tool/for_softmask/nama_data
Host key fingerprint is SHA256:KPa37JYErRVG/1YWy31gMOwAs13hHzUeg3opGD75qVY
+--[ED25519 256]--+
| .+. .=o=+.|
| o*.o.=.*+|
| oo.*oo B.o|
| ..o= +.* ..|
| o .+S o * . |
| . o. . E |
| ....o |
| oo+ |
| o= |
+----[SHA256]-----+
Ekam_ncbi_dataset.zip 100% 274MB 98.6MB/s 00:02
:~/Downloads$ (EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ ls
Ekam_ncbi_dataset.zip
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ unzip Ekam_ncbi_dataset.zip
Archive: Ekam_ncbi_dataset.zip
inflating: README.md
inflating: ncbi_dataset/data/data_summary.tsv
inflating: ncbi_dataset/data/assembly_data_report.jsonl
inflating: ncbi_dataset/data/GCA_014849505.1/genomic.gbff
inflating: ncbi_dataset/data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna
inflating: ncbi_dataset/data/GCA_014849505.1/sequence_report.jsonl
inflating: ncbi_dataset/data/dataset_catalog.json
inflating: md5sum.txt
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ ls
Ekam_ncbi_dataset.zip README.md md5sum.txt ncbi_dataset
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ ls ncbi_dataset/
data
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ ls data
ls: 'data' にアクセスできません: そのようなファイルやディレクトリはありません
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ ls ncbi_dataset/data/
GCA_014849505.1 assembly_data_report.jsonl data_summary.tsv dataset_catalog.json
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ ls ncbi_dataset/data/GCA_014849505.1/
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna genomic.gbff sequence_report.jsonl
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ mv ncbi_dataset/ Ekam_dataset
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ ls
Ekam_dataset Ekam_ncbi_dataset.zip README.md md5sum.txt
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ mv Ekam_ncbi_dataset.zip Ekam_dataset/
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ ls
Ekam_dataset README.md md5sum.txt
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ mv README.md Ekam_dataset/
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ mv md5sum.txt Ekam_dataset/
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ ls
Ekam_dataset
(EDTA2) dendezia@scorpion:~/tool/for_softmask/nama_data$ まずEkamのデータベースを作成する。
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ekam_softmask$ ls ../nama_data/Ekam_dataset/data/GCA_014849505.1/
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna genomic.gbff sequence_report.jsonl
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ekam_softmask$ BuildDatabase -name Ekam_BLAST_DATABASE ../nama_data/Ekam_dataset/data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna
Building database Ekam_BLAST_DATABASE:
Reading ../nama_data/Ekam_dataset/data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna...
Number of sequences (bp) added to database: 364527 ( 269635327 bp )
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ekam_softmask$ ls
Ekam_BLAST_DATABASE.nhr Ekam_BLAST_DATABASE.njs Ekam_BLAST_DATABASE.nni Ekam_BLAST_DATABASE.nsq
Ekam_BLAST_DATABASE.nin Ekam_BLAST_DATABASE.nnd Ekam_BLAST_DATABASE.nog Ekam_BLAST_DATABASE.translation
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ekam_softmask$ EkamのRepeatModeler
~/tool/for_softmask/Ekam_softmaskで以下のスクリプトを書き、qsubで実行した。
### Ekam_RepeatModeler.shの中身
#$ -S /bin/bash
#$ -cwd
echo start at
date
source /home/dendezia/tool/pyenv_env/EDTA_profile
RepeatModeler -database /home/dendezia/tool/for_softmask/Ekam_softmask/Ekam_BLAST_DATABASE -pa 6
date絶対パスじゃないとエラーが出るので注意!
IQTREEとASTRALの出力ファイルの先頭にツール名を付ける
import pandas as pd
# ファイルの読み込み
file_path = "Base_change.tab"
df = pd.read_csv(file_path, sep="\t")
# FamilyID列に「ASTRAL_」を追加
df['FamilyID'] = "ASTRAL_" + df['FamilyID'].astype(str)
# 結果のファイルを保存
output_file = "Base_change_with_astral.tab"
df.to_csv(output_file, sep="\t", index=False)
print(f"変換されたファイルが {output_file} に保存されました。")1008
:~/bio/for_cafe$ scp -r kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_cafe ~/bio/for_cafe/241008_original_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
addtreetool.py 100% 423 22.8KB/s 00:00
Base_clade_results.txt 100% 168 6.8KB/s 00:00
Base_asr.tre 100% 1702KB 10.2MB/s 00:00
Base_count.tab 100% 281KB 3.4MB/s 00:00
Base_change_with_astral.tab 100% 356KB 7.0MB/s 00:00
Base_results.txt 100% 162 5.4KB/s 00:00
Base_family_likelihoods.txt 100% 165KB 2.1MB/s 00:00
Base_family_results.txt 100% 157KB 3.1MB/s 00:00
Base_branch_probabilities.tab 100% 73KB 1.4MB/s 00:00
Base_change.tab 100% 377KB 7.6MB/s 00:00
tree_ASTRAL_ultrametric.nwk 100% 162 5.9KB/s 00:00
Orthogroups.GeneCount2.tsv 100% 400KB 3.6MB/s 00:00
Base_clade_results.txt 100% 163 5.8KB/s 00:00
Base_asr.tre 100% 1422KB 8.0MB/s 00:00
Base_count.tab 100% 245KB 3.2MB/s 00:00
Base_results.txt 100% 163 5.5KB/s 00:00
Base_family_likelihoods.txt 100% 154KB 2.1MB/s 00:00
Base_family_results.txt 100% 146KB 1.9MB/s 00:00
Base_branch_probabilities.tab 100% 73KB 2.1MB/s 00:00
Base_change.tab 100% 327KB 4.0MB/s 00:00
Base_clade_results.txt 100% 247 8.5KB/s 00:00
Base_asr.tre 100% 1485KB 9.3MB/s 00:00
Base_count.tab 100% 252KB 3.2MB/s 00:00
Base_results.txt 100% 161 5.2KB/s 00:00
yuui.py 100% 1861 59.5KB/s 00:00
subete_yuui.py 100% 2358 37.1KB/s 00:00
Base_family_likelihoods.txt 100% 149KB 2.9MB/s 00:00
Base_family_results.txt 100% 140KB 1.9MB/s 00:00
Base_branch_probabilities.tab 100% 60KB 971.9KB/s 00:00
Base_change.tab 100% 338KB 3.0MB/s 00:00
Tcas_yuui.txt 100% 1395 47.0KB/s 00:00
old_tree_ultrametric.nwk 100% 178 5.8KB/s 00:00
tree_ultrametric.nwk 100% 143 4.5KB/s 00:00
Orthogroups.GeneCount2.tsv 100% 400KB 4.9MB/s 00:00
addtreetool.py 100% 423 22.9KB/s 00:00
Base_clade_results.txt 100% 163 5.0KB/s 00:00
Base_asr.tre 100% 1422KB 8.8MB/s 00:00
Base_count.tab 100% 245KB 3.0MB/s 00:00
Base_results.txt 100% 163 6.3KB/s 00:00
Base_family_likelihoods.txt 100% 154KB 2.2MB/s 00:00
Base_family_results.txt 100% 146KB 2.6MB/s 00:00
Base_change_with_IQTREE.tab 100% 315KB 6.3MB/s 00:00
Base_branch_probabilities.tab 100% 72KB 2.0MB/s 00:00
Base_change.tab 100% 327KB 6.5MB/s 00:00
tree_IQTREE_ultrametric.nwk 100% 143 2.6KB/s 00:00
:~/bio/for_cafe$ ls
0930_orthofinder_data Deg Rplot01.png caferesult_6sp_iqtree.png out_madara_SP.txt
241007_cafe_original_data ManualPhylo_1.py Rplot02.png cafe後処理.R tree_IQTREE_ultrametric.nwk
241008_original_data ManualPhylo_2.py ThroughoutCAFE.R cleaned_orthogroups.tsv tree_ultrametric.nwk
ASTRAL_6sp_after_root_outgroup.txt ManualPhylo_3.py branch_site_lrt_results.txt for_cafe.Rproj
CAFE_plus_gene.csv Original_data bs_positive_gene.csv for_sinkagakkai.png
DEG_CAFE_adult_vs_larva.csv Processed_data caferesult.R ogfil.py
DEG_CAFE_ovary_vs_body.csv Rplot.png caferesult_6sp.png old_result
:~/bio/for_cafe$ ls 241008_original_data/
6sp_useASTRAL 6sp_useIQTREE madara_4weevil_Tcas_cafetest
:~/bio/for_cafe$ ASTRALを使ったPAMLの続き、尤度比検定
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp/bsA$ grep "lnL" result/OG00*_maffted_fixed_branch_alt
result/OG0008033_maffted_fixed_branch_alt:lnL(ntime: 10 np: 15): -6780.075815 +0.000000
result/OG0008036_maffted_fixed_branch_alt:lnL(ntime: 10 np: 15): -3759.435049 +0.000000
result/OG0008044_maffted_fixed_branch_alt:lnL(ntime: 10 np: 15): -3513.322057 +0.000000
result/OG0008046_maffted_fixed_branch_alt:lnL(ntime: 10 np: 15): -2111.234962 +0.000000
result/OG0008048_maffted_fixed_branch_alt:lnL(ntime: 10 np: 15): -6391.383350 +0.000000
result/OG0008055_maffted_fixed_branch_alt:lnL(ntime: 10 np: 15): -7082.109637 +0.000000
result/OG0008058_maffted_fixed_branch_alt:lnL(ntime: 10 np: 15): -6357.858837 +0.000000
result/OG0008060_maffted_fixed_branch_alt:lnL(ntime: 10 np: 15): -8190.191741 +0.000000
result/OG0008065_maffted_fixed_branch_alt:lnL(ntime: 10 np: 15): -3881.207146 +0.000000
result/OG0008070_maffted_fixed_branch_alt:lnL(ntime: 10 np: 15): -8519.310868 +0.000000
result/OG0008071_maffted_fixed_branch_alt:lnL(ntime: 10 np: 15): -11407.320280 +0.000000
result/OG0008075_maffted_fixed_branch_alt:lnL(ntime: 10 np: 15): -7685.699268 +0.000000
result/OG0008095_maffted_fixed_branch_alt:lnL(ntime: 10 np: 15): -5111.973555 +0.000000
result/OG0008097_maffted_fixed_branch_alt:lnL(ntime: 10 np: 15): -2546.272979 +0.000000
result/OG0008099_maffted_fixed_branch_alt:lnL(ntime: 10 np: 15): -7363.406172 +0.000000
result/OG0008101_maffted_fixed_branch_alt:lnL(ntime: 10 np: 15): -46936.546613 +0.000000
result/OG0008106_maffted_fixed_branch_alt:lnL(ntime: 10 np: 15): -8907.560611 +0.000000
result/OG0008110_maffted_fixed_branch_alt:lnL(ntime: 10 np: 15): -11486.635559 +0.000000
.
.
.
.
.kosukesano@at138:~/tools/for_paml/ASTRAL_6sp/bs_null$ grep "lnL" result/OG00*_maffted_fixed_branch_alt_null
result/OG0008033_maffted_fixed_branch_alt_null:lnL(ntime: 10 np: 14): -6780.105180 +0.000000
result/OG0008036_maffted_fixed_branch_alt_null:lnL(ntime: 10 np: 14): -3759.435049 +0.000000
result/OG0008044_maffted_fixed_branch_alt_null:lnL(ntime: 10 np: 14): -3513.334051 +0.000000
result/OG0008046_maffted_fixed_branch_alt_null:lnL(ntime: 10 np: 14): -2111.234961 +0.000000
result/OG0008048_maffted_fixed_branch_alt_null:lnL(ntime: 10 np: 14): -6392.377523 +0.000000
result/OG0008055_maffted_fixed_branch_alt_null:lnL(ntime: 10 np: 14): -7082.372261 +0.000000
result/OG0008058_maffted_fixed_branch_alt_null:lnL(ntime: 10 np: 14): -6357.858837 +0.000000
result/OG0008060_maffted_fixed_branch_alt_null:lnL(ntime: 10 np: 14): -8190.325473 +0.000000
result/OG0008065_maffted_fixed_branch_alt_null:lnL(ntime: 10 np: 14): -3881.207146 +0.000000
result/OG0008070_maffted_fixed_branch_alt_null:lnL(ntime: 10 np: 14): -8519.419180 +0.000000
result/OG0008071_maffted_fixed_branch_alt_null:lnL(ntime: 10 np: 14): -11407.055286 +0.000000
result/OG0008075_maffted_fixed_branch_alt_null:lnL(ntime: 10 np: 14): -7685.699268 +0.000000
result/OG0008095_maffted_fixed_branch_alt_null:lnL(ntime: 10 np: 14): -5111.973555 +0.000000
result/OG0008097_maffted_fixed_branch_alt_null:lnL(ntime: 10 np: 14): -2546.802003 +0.000000
result/OG0008099_maffted_fixed_branch_alt_null:lnL(ntime: 10 np: 14): -7363.406172 +0.000000
result/OG0008101_maffted_fixed_branch_alt_null:lnL(ntime: 10 np: 14): -46936.546616 +0.000000
.
.
.
.
.取れてないOGもあるっぽいけど、基本はちゃんとできてそう。
尤度比検定用のPythonスクリプトbs_lrp.pyを作成、実行した。
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ nano bs_lrp.py
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ python bs_lrp.py
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ ls
branch_site_lrt_results.txt bsA bs_lrp.py bs_null data rst rst1 rub run_paml.sh.e26903588 run_paml.sh.o26903588
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ less branch_site_lrt_results.txt
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ mv branch_site_lrt_results.txt ASTRAL_branch_site_lrt_results.txt
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ ls
ASTRAL_branch_site_lrt_results.txt bsA bs_lrp.py bs_null data rst rst1 rub run_paml.sh.e26903588 run_paml.sh.o26903588
kosukesano@at138:~/tools/for_paml/ASTRAL_6sp$ bs_lrp.pyの中身は以下の通り。
### bs_lrp.pyの中身
###~/tools/for_paml/6sp/bs_lrp.pyの中身
import os
import re
from scipy.stats import chi2
def parse_lnL(file_path):
with open(file_path, 'r') as f:
for line in f:
match = re.search(r'lnL\(ntime: \d+ np: (\d+)\):\s+(-?\d+\.\d+)', line)
if match:
np = int(match.group(1))
lnL = float(match.group(2))
return np, lnL
return None, None
def perform_lrt(alt_lnL, alt_np, null_lnL, null_np):
lr_stat = 2 * (alt_lnL - null_lnL)
df = alt_np - null_np
p_val = chi2.sf(lr_stat, df)
return p_val
def main():
alt_dir = '~/tools/for_paml/ASTRAL_6sp/bsA/result'
null_dir = '~/tools/for_paml/ASTRAL_6sp/bs_null/result'
output_file = 'branch_site_lrt_results.txt'
alt_dir = os.path.expanduser(alt_dir)
null_dir = os.path.expanduser(null_dir)
og_files = [f for f in os.listdir(alt_dir) if '_maffted_fixed_branch_alt' in f]
with open(output_file, 'w') as out_f:
out_f.write('OG_num\tp_val\tpositive_selection\n')
for og_file in og_files:
og_num = og_file.split('_')[0]
alt_file = os.path.join(alt_dir, og_file)
null_file = os.path.join(null_dir, og_file.replace('_maffted_fixed_branch_alt', '_maffted_fixed_branch_alt_null'))
if os.path.exists(null_file):
alt_np, alt_lnL = parse_lnL(alt_file)
null_np, null_lnL = parse_lnL(null_file)
if alt_np is not None and null_np is not None:
p_val = perform_lrt(alt_lnL, alt_np, null_lnL, null_np)
reject_null = '+' if p_val < 0.05 else '-'
out_f.write(f'{og_num}\t{p_val}\t{reject_null}\n')
if __name__ == "__main__":
main()これをローカルに送った。ローカルでは~/bio/for_paml/241008を作りそこに格納。
:~/bio/for_paml$ mkdir 241008
:~/bio/for_paml$ scp kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano//tools/for_paml/ASTRAL_6sp/ASTRAL_branch_site_lrt_results.txt 241008
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
ASTRAL_branch_site_lrt_results.txt 100% 7551 218.5KB/s 00:00
:~/bio/for_paml$ ls 241008/
ASTRAL_branch_site_lrt_results.txtまた、FDRの検定を行った。
:~/bio/for_paml$ source paml_hosei/bin/activate
(paml_hosei) :~/bio/for_paml$ xd 241008/
bash: xd: command not found
(paml_hosei) :~/bio/for_paml$ ls
241008 branch_site_lrt_results.txt hosei.py hosei_branch_site_lrt_results.txt paml_hosei
(paml_hosei) :~/bio/for_paml$ cd 241008/
(paml_hosei) :~/bio/for_paml/241008$ ls
ASTRAL_branch_site_lrt_results.txt hosei.py
(paml_hosei) :~/bio/for_paml/241008$ python hosei.py
補正後の結果がhosei_ASTRAL_branch_site_lrt_results.txtに保存されました。検定後のデータは以下の通り。
FDR=read.csv("/Users/kosukesano/bio/for_paml/241008/hosei_ASTRAL_branch_site_lrt_results.txt", sep="\t")|>
dplyr::filter(significant == "True")
orthogroups_file <- "/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/Orthogroups.tsv"
# Orthogroups.tsvの読み込み
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
read.delim(orthogroups_file, header=FALSE, sep="\t",
#stringsAsFactors=FALSE,
#col.names = "Data"
skip=1
)|>
dplyr::select("V1", "V5")
FDR_2=dplyr::left_join(FDR, orthogroups, by = c(OG_num = "V1"))|>
rename(gene_ID = V5)|>
dplyr::mutate(gene_ID = stringr::str_replace(gene_ID, "^Smad_", ""))
fa<-read.csv("/Users/kosukesano/bio/functional_annotation/merged_with_gene_function.csv", sep=",")
FDR_3=dplyr::left_join(FDR_2, fa, by = c(gene_ID = "Madara"))|>
print() OG_num p_val positive_selection q_val significant gene_ID
1 OG0009606 1.504944e-04 + 1.635372e-02 True g12267.t1
2 OG0008249 5.519799e-04 + 4.498636e-02 True g9681.t1
3 OG0008782 3.281172e-18 + 1.069662e-15 True g9945.t1
4 OG0008142 3.323308e-05 + 5.416991e-03 True g4236.t1
Ecoli Ecol_GeneFunction Dmelanogaster
1 Dmel_NP_001163062.1
2 Dmel_NP_476803.1
3 Dmel_NP_476617.1
4 Dmel_NP_610687.1
Dmel_GeneFunction Tcastaneum
1 heparan sulfate C5-epimerase, isoform B Tcas_XP_015839037.1
2 scute Tcas_NP_001034533.1
3 laminin A Tcas_XP_008190900.1
4 enigma Tcas_XP_008190394.1
Tcas_GeneFunction
1 PREDICTED: D-glucuronyl C5-epimerase
2 asense
3 PREDICTED: laminin subunit alpha
4 PREDICTED: acyl-CoA dehydrogenase family member 9, mitochondrial
Soryzae Sory_GeneFunction
1 Sory_XP_030766642.1 D-glucuronyl C5-epimerase B
2 Sory_XP_030750625.1 uncharacterized protein LOC115878311
3 Sory_XP_030765960.1 laminin subunit alpha
4 Sory_XP_030752597.1 acyl-CoA dehydrogenase family member 9, mitochondrial
IQ-TREEを使ったPAMLの続き、尤度比検定
bs_nullの方の出力ファイルが_maffted_fixed_branch_altだけだったので、末尾に_nullを追加するスクリプトplus_null.pyを作成、実行した。
### plus_null.pyの中身
import os
# ファイルが格納されているディレクトリのパス
directory = '~/tools/for_paml/IQTREE_6sp/bs_null/result'
# 実際のパスに変換
directory = os.path.expanduser(directory)
# ディレクトリ内のファイルを一括で変更
for filename in os.listdir(directory):
if '_maffted_fixed_branch_alt' in filename:
# 新しいファイル名を作成
new_filename = filename.replace('_maffted_fixed_branch_alt', '_maffted_fixed_branch_alt_null')
# ファイル名の変更
os.rename(os.path.join(directory, filename), os.path.join(directory, new_filename))
print("ファイル名の変更が完了しました。")これをやったのち、bs_lrp.pyを作成して実行。
なぜか出力ファイルに書き込まれない
データがSCOではない?
ちゃんとSCOが取れていない。
### 6sp/data/SCO_plusname/OG0010059_maffted_fixed.fastaの中身
>Cass
atggagaacttagcaaagccccaaataatttgccacaatcaaaaatccttagattacgct
attcacgacgtcaaatggattccttgctccgcaaaatttgtagctataggaggcaaatct
aacggtgcaggtattgtggaaacttatcagctatctgcagatggcatagaaaaactagac
gaattttgcaaaaaggatcacttcaaatgttgcacttttgaagcgtcgagtttgaggaac
aggcatttggcgactggagatttttcgggacgattacaactctgggacctagaagacact
ctgacaccagtttacaaaaccacagtgcacactgctgtaatcaattcaatagatggagtg
gcaggccaaagcgctaactgtggagctccagaaattgttactggttcccgcgacggttgt
gtaatggtatgggatgtgcgccaaaaagacattccagtggcaaaattcactcctttagaa
ggccaagcaggcagagactgttggtgcgtggcttttggaaattcctacaacgacactgaa
aggatagtagctgcaggatatgataatggagacgttaaattgtttgacttgaaaactatg
agcgtacgatggacgaaatgccttaaaaatgggattgtcgatttgcaatttgatcgcaaa
gatataccgatgaacaaactggtggccaccacgttggaatctaaatttttctgtttcgat
gtacgcactcaacatccaaaaaaaggctttgcgcatttaatagaaaatgcgcatgcatct
acaatttggcaagtaaagcatctgccgcaaaatcgagaaatctttatgactaccggaggc
ggtggatctttgtgtttatggaaatatacatatccaccaaaaagagtagagaaagactct
gaaggtatccaatatggaataatgggtgaattacaccaaatacaaaacagtggactttct
gatcaaccgataacggcttttgattggtgcgtggataaattgggccttgcagtgtgttca
gcttacgatcagactttaagagttctgataacgaccaaactgaatttatgctaa
>Smad
atggaatctttagcgaaaccccaaataatttgtcacaatcagaaatcattagattatgcg
attcacgatgtgaaatggataccttgttctgcgaaatttatttctgtagggggaaaatca
aacggagcgggcatagtagaaatttattcgatatccggggaaggagtggaaaaactggac
gaattttgcaaaaaggatcattttaaatgctgcacattcgatgcttctagcttaaggaat
cggcatttagctactggggacttttcaggacgattgcaactttgggatttggaagacact
ataatgcctgtttataaaactacgactcacactgctgttattaactcaatagacggggta
gcggggcaaagcgccaactgtggagcgcctgaaatagtgacaggttctcgtgatggttgt
gtgatggtttgggacgtgagacagaaggacattccggtagcgaaattcacccccctcgaa
gggcaaagtggacgagattgttggtgcgtagcctttggaaattcttacaacaacgaagag
agggtagtagctgcaggatacgataacggggatgttaaaattttcgatctaaaaaccatg
agcgttcgatggacaaagtgtctaaaaaacggggtggtaaatcttcaattcgaccgaaaa
gacattcccatgaacaaactagtggtgaccaccctggaatcgaaatttttctgcttcgac
gtccgcactcaacatcccaaaaaaggattcgcccacctttccgaaaccgcacacgcctct
acgatatggcaagtgaaacacttgcctcagaacagagaaattttcatgacgaccggtggt
agtgggtctttgtgtttatggaagtacaattacccaatcaaaagggttgaaaaagattct
gaaggaattccatatggaatcataggtgacgtacaacaactccaaaacagtgccctgtct
gaacaacccatcactgcttttgactggtgtgttgacaaactaggtctagctgtgtgctca
gcatatgaccaaaccttgagagttttaataactactaaattgaacttatattag改めてOGのCDSを取るため、~/tools/for_paml/data/241008_SCOディレクトリを作りExOG.pyを書いた。
### ExOG.pyの中身
# ファイルパスの設定
orthogroups_file_path = '/home/kosukesano/tools/for_orthofinder/RemakeHedder_6sp/OrthoFinder/Results_Sep19/Orthogroups/Orthogroups.txt'
single_copy_orthologues_file_path = '/home/kosukesano/tools/for_orthofinder/RemakeHedder_6sp/OrthoFinder/Results_Sep19/Orthogroups/Orthogroups_SingleCopyOrthologues.txt'
output_file_path = '/home/kosukesano/tools/for_paml/data/241008_SCO/extracted_orthogroups.txt'
# シングルコピーオルソログのIDをセットに格納
single_copy_orthologues = set()
with open(single_copy_orthologues_file_path, 'r') as single_copy_file:
for line in single_copy_file:
single_copy_orthologues.add(line.strip())
# Orthogroups.txt から該当する行を抽出して新しいファイルに保存
with open(orthogroups_file_path, 'r') as orthogroups_file, open(output_file_path, 'w') as output_file:
for line in orthogroups_file:
# 行の最初の部分を取り出してIDをチェック
og_id = line.split(':')[0].strip()
if og_id in single_copy_orthologues:
output_file.write(line)これを実行すると、extracted_orthogroups.txtができる。
OG0008033: Agra_P_050292688.1 Cass_AG9767834.1 Dpon_P_019769583.1 Smad_g5339.t1 Sory_P_030760502.1 Tcas_P_001812254.1
OG0008034: Agra_P_050292700.1 Cass_AG9761214.1 Dpon_P_019755574.2 Smad_g6358.t1 Sory_P_030761209.1 Tcas_P_008195282.1
OG0008035: Agra_P_050292731.1 Cass_AH1135743.1 Dpon_P_048519923.1 Smad_g2098.t1 Sory_P_030765758.1 Tcas_P_008196870.1
OG0008036: Agra_P_050292732.1 Cass_AG9767756.1 Dpon_P_019773495.1 Smad_g5269.t1 Sory_P_030765067.1 Tcas_P_015836383.1
OG0008037: Agra_P_050292739.1 Cass_AG9768060.1 Dpon_P_019769194.2 Smad_g11904.t1 Sory_P_030755089.1 Tcas_P_969265.1
OG0008039: Agra_P_050292743.1 Cass_AG9767942.1 Dpon_P_019767966.1 Smad_g4980.t1 Sory_P_030750408.1 Tcas_P_971491.1
OG0008040: Agra_P_050292768.1 Cass_AH1123990.1 Dpon_P_048523285.1 Smad_g10276.t1 Sory_P_030759374.1 Tcas_P_975603.1
OG0008041: Agra_P_050292798.1 Cass_AG9770235.1 Dpon_P_019769671.2 Smad_g12750.t1 Sory_P_030747529.1 Tcas_P_971970.1
OG0008042: Agra_P_050292813.1 Cass_AG9770251.1 Dpon_P_019769634.2 Smad_g5261.t1 Sory_P_030747567.1 Tcas_P_968688.1
OG0008043: Agra_P_050292817.1 Cass_AG9770190.1 Dpon_P_019769690.1 Smad_g5262.t1 Sory_P_030747568.1 Tcas_P_968766.1
OG0008044: Agra_P_050292828.1 Cass_AG9770237.1 Dpon_P_019769698.1 Smad_g7152.t1 Sory_P_030747658.1 Tcas_P_008190584.1
OG0008045: Agra_P_050292879.1 Cass_AG9762270.1 Dpon_P_019773117.1 Smad_g12600.t1 Sory_P_030759522.1 Tcas_P_972888.1
OG0008046: Agra_P_050292889.1 Cass_AG9762382.1 Dpon_P_019753344.1 Smad_g8693.t1 Sory_P_030760073.1 Tcas_P_008195985.1kosukesano@at138:~/tools/for_paml/data$ mv /home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir/*fasta 6sp_nama_data/
kosukesano@at138:~/tools/for_paml/data$ ls 6sp_nama_data/
Agra.fasta Agra_mt.fasta Cass.fasta Dmel_mt.fasta Dpon.fasta Dpon_mt.fasta Smad.fasta Sory.fasta Sory_mt.fasta Tcas.fasta Tcas_mt.fasta query.fasta
kosukesano@at138:~/tools/for_paml/data$ cd 6sp_nama_data/
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ ls
Agra.fasta Agra_mt.fasta Cass.fasta Dmel_mt.fasta Dpon.fasta Dpon_mt.fasta Smad.fasta Sory.fasta Sory_mt.fasta Tcas.fasta Tcas_mt.fasta query.fasta
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ rm *_mt.fasta
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ ls
Agra.fasta Cass.fasta Dpon.fasta Smad.fasta Sory.fasta Tcas.fasta query.fasta
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ less query.fasta
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ rm query.fasta
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$間違えて.fastaファイルをrmしちゃったので、もう一度もとのファイルからコピーしてきた。
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ python edit.py
../6sp_nama_data/Tcas.fasta に保存しました。
../6sp_nama_data/Agra.fasta に保存しました。
../6sp_nama_data/Smad.fasta に保存しました。
../6sp_nama_data/Cass.fasta に保存しました。
../6sp_nama_data/Dpon.fasta に保存しました。
../6sp_nama_data/Sory.fasta に保存しました。
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ ls
Agra.fasta Cass.fasta Dpon.fasta Smad.fasta Sory.fasta Tcas.fasta edit.py
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ less Agra.fasta
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ less Cass.fasta
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ less Dpon.fasta
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ less Smad.fasta
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ rm *.fasta
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ ls /home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir/*fasta
ls: cannot access '/home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir/*fasta': No such file or directory
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ ls /home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir/
Agra.fasta_blast_results.txt Cass.fasta_db.nin Dpon.fasta_db.nsq Smad.fasta_db.nto Tcas.fasta_db.nhr
Agra.fasta_db.ndb Cass.fasta_db.njs Dpon.fasta_db.ntf Sory.fasta_blast_results.txt Tcas.fasta_db.nin
Agra.fasta_db.nhr Cass.fasta_db.not Dpon.fasta_db.nto Sory.fasta_db.ndb Tcas.fasta_db.njs
Agra.fasta_db.nin Cass.fasta_db.nsq Dpon_mt.fasta_blast_results.txt Sory.fasta_db.nhr Tcas.fasta_db.not
Agra.fasta_db.njs Cass.fasta_db.ntf Dpon_mt.fasta_db.ndb Sory.fasta_db.nin Tcas.fasta_db.nsq
Agra.fasta_db.not Cass.fasta_db.nto Dpon_mt.fasta_db.nhr Sory.fasta_db.njs Tcas.fasta_db.ntf
Agra.fasta_db.nsq Dmel_mt.fasta_blast_results.txt Dpon_mt.fasta_db.nin Sory.fasta_db.not Tcas.fasta_db.nto
Agra.fasta_db.ntf Dmel_mt.fasta_db.ndb Dpon_mt.fasta_db.njs Sory.fasta_db.nsq Tcas_mt.fasta_blast_results.txt
Agra.fasta_db.nto Dmel_mt.fasta_db.nhr Dpon_mt.fasta_db.not Sory.fasta_db.ntf Tcas_mt.fasta_db.ndb
Agra_mt.fasta_blast_results.txt Dmel_mt.fasta_db.nin Dpon_mt.fasta_db.nsq Sory.fasta_db.nto Tcas_mt.fasta_db.nhr
Agra_mt.fasta_db.ndb Dmel_mt.fasta_db.njs Dpon_mt.fasta_db.ntf Sory_mt.fasta_blast_results.txt Tcas_mt.fasta_db.nin
Agra_mt.fasta_db.nhr Dmel_mt.fasta_db.not Dpon_mt.fasta_db.nto Sory_mt.fasta_db.ndb Tcas_mt.fasta_db.njs
Agra_mt.fasta_db.nin Dmel_mt.fasta_db.nsq OrthoFinder Sory_mt.fasta_db.nhr Tcas_mt.fasta_db.not
Agra_mt.fasta_db.njs Dmel_mt.fasta_db.ntf Smad.fasta_blast_results.txt Sory_mt.fasta_db.nin Tcas_mt.fasta_db.nsq
Agra_mt.fasta_db.not Dmel_mt.fasta_db.nto Smad.fasta_db.ndb Sory_mt.fasta_db.njs Tcas_mt.fasta_db.ntf
Agra_mt.fasta_db.nsq Dpon.fasta_blast_results.txt Smad.fasta_db.nhr Sory_mt.fasta_db.not Tcas_mt.fasta_db.nto
Agra_mt.fasta_db.ntf Dpon.fasta_db.ndb Smad.fasta_db.nin Sory_mt.fasta_db.nsq co1blast.sh
Agra_mt.fasta_db.nto Dpon.fasta_db.nhr Smad.fasta_db.njs Sory_mt.fasta_db.ntf
Cass.fasta_blast_results.txt Dpon.fasta_db.nin Smad.fasta_db.not Sory_mt.fasta_db.nto
Cass.fasta_db.ndb Dpon.fasta_db.njs Smad.fasta_db.nsq Tcas.fasta_blast_results.txt
Cass.fasta_db.nhr Dpon.fasta_db.not Smad.fasta_db.ntf Tcas.fasta_db.ndb
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ cd ~/tools/
kosukesano@at138:~/tools$ ls
AUGUSTUS_CONFIG_copy Arthropoda.fa EDTA_git_install ProtHint_git_install braker_git_install for_MAFFT for_brakertest for_orthofinder for_softmask
All_AUGUSTUS_test DIAMOND_git_install GeneMarkETP_git_install TSEBRA_git_install for_ASTRAL for_braker for_cafe for_paml pyenv_env
kosukesano@at138:~/tools$ cd ~/tools/for_braker
kosukesano@at138:~/tools/for_braker$ ls
Femo Femo_pilon Kohuki Kohuki_thread_one Madara OnlyProtein_femo OnlyProtein_madara nama_data
kosukesano@at138:~/tools/for_braker$ cd Madara/
kosukesano@at138:~/tools/for_braker/Madara$ ls
BUSCO_OUTPUT_Madara_WITHRNA madara_braker.sh madara_braker.sh.o26149250 madara_braker.sh.pe26149256 madara_braker.zip madara_busco.sh.o26170184
braker madara_braker.sh.e26149250 madara_braker.sh.o26149256 madara_braker.sh.po26149250 madara_busco.sh madara_busco.sh.pe26170184
busco_downloads madara_braker.sh.e26149256 madara_braker.sh.pe26149250 madara_braker.sh.po26149256 madara_busco.sh.e26170184 madara_busco.sh.po26170184
kosukesano@at138:~/tools/for_braker/Madara$ cd braker/
kosukesano@at138:~/tools/for_braker/Madara/braker$ ls
Augustus braker.aa braker.codingseq.zip braker.log errors hintsfile.gff t1tyusyutu.py
GeneMark-ETP braker.codingseq braker.gtf braker_t1_sequences.aa genome_header.map species what-to-cite.txt
kosukesano@at138:~/tools/for_braker/Madara/braker$ cp braker.codingseq ~/tools/for_paml/data/6sp_nama_data/Smad.fasta
kosukesano@at138:~/tools/for_braker/Madara/braker$ less ~/tools/for_paml/data/6sp_nama_data/Smad.fasta
kosukesano@at138:~/tools/for_braker/Madara/braker$ cd
kosukesano@at138:~$ ls
Desktop local mafft_plusname.sh.e26313496 mafft_plusname.sh.o26313501 manualphilo.sh.o26271286 manualphylo.sh.o26819185 reference_sequence rst1
bsAtest.sh.e26312004 mafft.sh.e26293911 mafft_plusname.sh.e26313501 manualphilo.sh.e26271286 manualphilo.sh.o26837716 old_envilonment_until20240430 results_sh_eando rub
bsAtest.sh.o26312004 mafft.sh.o26293911 mafft_plusname.sh.o26313496 manualphilo.sh.e26837716 manualphylo.sh.e26819185 pyenv_conda_environment rst tools
kosukesano@at138:~$ cd reference_sequence/
kosukesano@at138:~/reference_sequence$ ls
Ecoli merge_rbh.py rbh.sh rbh.sh.e26240603 rbh.sh.o26240603 rbh.sh.pe26240603 rbh.sh.po26240603
Madara merged_best_hits.txt rbh.sh.e26231590 rbh.sh.o26231590 rbh.sh.pe26231590 rbh.sh.po26231590 rbh_result.txt
Sory_Tcas_Dmel_Ecol_ref merged_best_hits_with_function.txt rbh.sh.e26231593 rbh.sh.o26231593 rbh.sh.pe26231593 rbh.sh.po26231593 reciprocal_best_hits_Dmel.txt
addfunction_test.py new_rbh.py rbh.sh.e26231600 rbh.sh.o26231600 rbh.sh.pe26231600 rbh.sh.po26231600 reciprocal_best_hits_Ecol.txt
addproduct_test.py out_Dmel_blastp_RefAsMadara.txt rbh.sh.e26231603 rbh.sh.o26231603 rbh.sh.pe26231603 rbh.sh.po26231603 reciprocal_best_hits_Sory.txt
addproduct_test2.py out_Ecol_blastp_RefAsMadara.txt rbh.sh.e26231610 rbh.sh.o26231610 rbh.sh.pe26231610 rbh.sh.po26231610 reciprocal_best_hits_Tcas.txt
blastp_4sp_test.sh out_Sory_blastp_RefAsMadara.txt rbh.sh.e26237227 rbh.sh.o26237227 rbh.sh.pe26237227 rbh.sh.po26237227 reciprocal_best_hits_madara.txt
blastp_RefAsMadara.sh out_Tcas_blastp_RefAsMadara.txt rbh.sh.e26237813 rbh.sh.o26237813 rbh.sh.pe26237813 rbh.sh.po26237813
functional_annotation out_madara_as_ref_blastp_.txt rbh.sh.e26237904 rbh.sh.o26237904 rbh.sh.pe26237904 rbh.sh.po26237904
gene_function.txt out_madara_blastp_test.txt rbh.sh.e26238740 rbh.sh.o26238740 rbh.sh.pe26238740 rbh.sh.po26238740
makedic_test.py rbh.py rbh.sh.e26238754 rbh.sh.o26238754 rbh.sh.pe26238754 rbh.sh.po26238754
kosukesano@at138:~/reference_sequence$ cd ../old_envilonment_until20240430/
kosukesano@at138:~/old_envilonment_until20240430$ ls
EDTA GeMoMa_temp busco_downloads cafetest gall leaf_beetle other_weevil outgroup paml_test ronbun_sp
kosukesano@at138:~/old_envilonment_until20240430$ cd other_weevil/
kosukesano@at138:~/old_envilonment_until20240430/other_weevil$ ls
Anthonomus_grandis_grandis Ceutorhynchus_assimilis Cylas_formicarius Nicrophorus_vespilloides Soryzae
kosukesano@at138:~/old_envilonment_until20240430/other_weevil$ cd Anthonomus_grandis_grandis/
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Anthonomus_grandis_grandis$ ls
README.md 'download?include_annotation_type=GENOME_FASTA,GENOME_GFF,RNA_FASTA,CDS_FASTA,PROT_FASTA,SEQUENCE_REPORT' ncbi_dataset
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Anthonomus_grandis_grandis$ cd ncbi_dataset/data/GCF_022605725.1/
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Anthonomus_grandis_grandis/ncbi_dataset/data/GCF_022605725.1$ ls
Anthonomus_buscotest.sh Anthonomus_buscotest.sh.o25642658 Anthonomus_buscotest.sh.po25642658 busco_downloads cds_from_genomic.fna protein.faa sequence_report.jsonl
Anthonomus_buscotest.sh.e25642658 Anthonomus_buscotest.sh.pe25642658 GCF_022605725.1_icAntGran1.3_genomic.fna busco_out genomic.gff rna.fna
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Anthonomus_grandis_grandis/ncbi_dataset/data/GCF_022605725.1$ cp cds_from_genomic.fna ~/tools/for_paml/data/6sp_nama_data/Agra.fasta
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Anthonomus_grandis_grandis/ncbi_dataset/data/GCF_022605725.1$ less ~/tools/for_paml/data/6sp_nama_data/Agra.fasta
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Anthonomus_grandis_grandis/ncbi_dataset/data/GCF_022605725.1$ cd ../../../../Ceutorhynchus_assimilis/ncbi_dataset/data/GCA_917834065.1/
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Ceutorhynchus_assimilis/ncbi_dataset/data/GCA_917834065.1$ ls
Ceutorhynchus_buscotest.sh Ceutorhynchus_buscotest.sh.o25642655 Ceutorhynchus_buscotest.sh.po25642655 busco_downloads cds_from_genomic.fna protein.faa
Ceutorhynchus_buscotest.sh.e25642655 Ceutorhynchus_buscotest.sh.pe25642655 GCA_917834065.1_PGI_CEUTPL_v4_genomic.fna busco_out genomic.gff sequence_report.jsonl
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Ceutorhynchus_assimilis/ncbi_dataset/data/GCA_917834065.1$ cp cds_from_genomic.fna ~/tools/for_paml/data/6sp_nama_data/Cass.fasta
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Ceutorhynchus_assimilis/ncbi_dataset/data/GCA_917834065.1$ cd ../../../../Soryzae/ncbi_dataset/data/
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Soryzae/ncbi_dataset/data$ ls
GCA_002938485.2 GCF_002938485.1 assembly_data_report.jsonl data_summary.tsv dataset_catalog.json
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Soryzae/ncbi_dataset/data$ cd GCF_002938485.1/
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Soryzae/ncbi_dataset/data/GCF_002938485.1$ ls
GCF_002938485.1_Soryzae_2.0_genomic.fna Soryzae_busco.sh.e26203344 Soryzae_busco.sh.pe26203344 busco_downloads cds_from_genomic.fna genomic.gff protein.faa sequence_report.jsonl
Soryzae_busco.sh Soryzae_busco.sh.o26203344 Soryzae_busco.sh.po26203344 busco_out genomic.gbff genomic.gtf rna.fna
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Soryzae/ncbi_dataset/data/GCF_002938485.1$ cp cds_from_genomic.fna ~/tools/for_paml/data/6sp_nama_data/Sory.fasta
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Soryzae/ncbi_dataset/data/GCF_002938485.1$ cd ../../../../
kosukesano@at138:~/old_envilonment_until20240430/other_weevil$ cd ../
kosukesano@at138:~/old_envilonment_until20240430$ ls
EDTA GeMoMa_temp busco_downloads cafetest gall leaf_beetle other_weevil outgroup paml_test ronbun_sp
kosukesano@at138:~/old_envilonment_until20240430$ ls outgroup/
Drosophila_melanogaster Tribolium_castaneum
kosukesano@at138:~/old_envilonment_until20240430$ ls ronbun_sp/
Dendroctonus_ponderosae Drosophila_melanogaster Orthotest Rhynchophorus_ferrugineus Tribolium_castaneum cds_matome pep_matome test three_sp_cds_matome
kosukesano@at138:~/old_envilonment_until20240430$ cd ronbun_sp/
kosukesano@at138:~/old_envilonment_until20240430/ronbun_sp$ ls
Dendroctonus_ponderosae Drosophila_melanogaster Orthotest Rhynchophorus_ferrugineus Tribolium_castaneum cds_matome pep_matome test three_sp_cds_matome
kosukesano@at138:~/old_envilonment_until20240430/ronbun_sp$ cd Dendroctonus_ponderosae/ncbi_dataset/data/GCF_020466585.1/
kosukesano@at138:~/old_envilonment_until20240430/ronbun_sp/Dendroctonus_ponderosae/ncbi_dataset/data/GCF_020466585.1$ cp cds_from_genomic.fna ~/tools/for_paml/data/6sp_nama_data/Dpon.fasta
kosukesano@at138:~/old_envilonment_until20240430/ronbun_sp/Dendroctonus_ponderosae/ncbi_dataset/data/GCF_020466585.1$ cd ../../../../../outgroup/Tribolium_castaneum/ncbi_dataset/data/GCF_000002335.3/
kosukesano@at138:~/old_envilonment_until20240430/outgroup/Tribolium_castaneum/ncbi_dataset/data/GCF_000002335.3$ cp cds_from_genomic.fna ~/tools/for_paml/data/6sp_nama_data/Tcas.fasta
kosukesano@at138:~/old_envilonment_until20240430/outgroup/Tribolium_castaneum/ncbi_dataset/data/GCF_000002335.3$ cd ~/tools/for_paml/data/6sp_nama_data/
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ ls
Agra.fasta Cass.fasta Dpon.fasta Smad.fasta Sory.fasta Tcas.fasta edit.py
kosukesano@at138:~/tools/for_paml/data/6sp_nama_data$ CDS配列ファイルのヘッダーだけproteinにするスクリプト
### ch_hed.py
from Bio import SeqIO
# ファイルパスの設定
paml_fasta = "/home/kosukesano/tools/for_paml/data/6sp_nama_data/Agra.fasta"
orthofinder_fasta = "/home/kosukesano/tools/for_orthofinder/Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/Agra.fasta"
output_fasta = "/home/kosukesano/tools/for_paml/data/6sp_nama_data/changehedder/Agra_changehedder.fasta"
# orthofinderのファイルからprotein_IDをキーにした辞書を作成
orthofinder_dict = {}
for record in SeqIO.parse(orthofinder_fasta, "fasta"):
protein_id = record.id.split()[0] # protein_IDは最初のスペースまでの部分
print(f"Extracted orthofinder_protein_ID: {protein_id}")
orthofinder_dict[protein_id] = record
# 出力用リスト
output_records = []
# pamlのファイルを処理
for record in SeqIO.parse(paml_fasta, "fasta"):
header_parts = record.description.split("protein_id=")
if len(header_parts) > 1:
protein_id = header_parts[1].split("]")[0] # protein_idを抽出
print(f"Extracted CDS_protein_ID: {protein_id}") # 抽出したprotein_IDを出力
# orthofinderファイルで一致するprotein_IDがあるか確認
if protein_id in orthofinder_dict:
print(f"Match found for protein_ID: {protein_id}")
# 一致する場合、ヘッダーを置き換え
new_header = orthofinder_dict[protein_id].description
record.description = new_header
else:
print(f"No match found for protein_ID: {protein_id}")
else:
print(f"protein_ID not found in header: {record.description}")
# 出力リストに追加
output_records.append(record)
# 新しいファイルに書き出し
SeqIO.write(output_records, output_fasta, "fasta")
print(f"Modified fasta file saved to: {output_fasta}")### ~/tools/for_paml/data/6sp_nama_data/changehedder/edit.pyの中身
import os
from Bio import SeqIO
# 入力ディレクトリと出力ディレクトリのパス
input_dir = '../changehedder/'
output_dir = '../RemakeHedder_6sp_afterchange/'
# 出力ディレクトリが存在しない場合は作成
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# 入力ディレクトリ内のすべての .fasta ファイルを処理
for input_file in os.listdir(input_dir):
if input_file.endswith('.fasta'):
input_path = os.path.join(input_dir, input_file)
output_path = os.path.join(output_dir, input_file)
# 入力ファイルを読み込み、条件に基づいて書き換えた内容を出力ファイルに保存
with open(output_path, 'w') as outfile:
for record in SeqIO.parse(input_path, 'fasta'):
header = record.description
seq = str(record.seq)
# ヘッダーが「g」で始まる場合
if header.startswith("g"):
# 新しいヘッダーは「>Smad」 + 「元のヘッダーの番号」
number = header.split()[0] # ヘッダーの最初の番号部分を取得
new_header = f">Smad_{number}"
# ヘッダーが「]」で終わる場合
elif header.endswith("]"):
# ヘッダーの最後の「[]」内の英字を抽出
within_brackets = header.split('[')[-1].split(']')[0]
first_letter = within_brackets[0] # 最初の1文字
space_after = within_brackets.split()[-1][:3] # スペース後の3文字
# 元のヘッダーから最初の「>」の次の文字から最初の「 」までの部分を取得
first_part = header.split()[1][1:]
new_header = f">{first_letter}{space_after}_{first_part}"
else:
new_header = f">{header.split()[0]}"
# 新しいヘッダーと配列を出力ファイルに書き込む
outfile.write(f"{new_header}\n{seq}\n")
print(f"{output_path} に保存しました。")1009
CDS取得の続き
昨日のch_hedder.pyを全種分行い、~/tools/for_paml/data/6sp_nama_data/changehedder/に保存した。
kosukesano@at139:~/tools/for_paml/data$ ls 6sp_nama_data/changehedder/
Agra_changehedder.fasta Dpon_changehedder.fasta Sory_changehedder.fasta ch_hed.py makedf.py
Cass_changehedder.fasta Smad_changehedder.fasta Tcas_changehedder.fasta edit.py protein_headers.csvその後、edit.pyを実行し、ファイルを~/tools/for_paml/data/241009_RemakeHedder_6sp_afterchange/に保存した。
kosukesano@at139:~/tools/for_paml/data/6sp_nama_data/changehedder$ python edit.py
../../241009_RemakeHedder_6sp_afterchange/Sory_changehedder.fasta に保存しました。
../../241009_RemakeHedder_6sp_afterchange/Dpon_changehedder.fasta に保存しました。
../../241009_RemakeHedder_6sp_afterchange/Tcas_changehedder.fasta に保存しました。
../../241009_RemakeHedder_6sp_afterchange/Smad_changehedder.fasta に保存しました。
../../241009_RemakeHedder_6sp_afterchange/Cass_changehedder.fasta に保存しました。
../../241009_RemakeHedder_6sp_afterchange/Agra_changehedder.fasta に保存しました。
kosukesano@at139:~/tools/for_paml/data/6sp_nama_data/changehedder$ cd ../../上記のファイルを使用し、SCOを抽出する。そのスクリプト~/tools/for_paml/data/241008_SCO/new_makefna.pyは以下の通り。
### new_makefna.pyの中身
# 必要なモジュールをインポート
import os
# ファイルパスの設定
orthogroups_file = "extracted_orthogroups.txt"
input_dir = "../241009_RemakeHedder_6sp_afterchange/"
output_dir = "/home/kosukesano/tools/for_paml/data/CDS_SCO/"
# ディレクトリが存在しない場合、作成
os.makedirs(output_dir, exist_ok=True)
# OG番号と遺伝子IDをextracted_orthogroups.txtから取得
with open(orthogroups_file, "r") as ortho_f:
for line in ortho_f:
if line.strip(): # 空行を無視
# 行をOG番号と遺伝子IDに分割
og_number, gene_ids_str = line.split(":")
og_number = og_number.strip()
gene_ids = gene_ids_str.strip().split()
# 遺伝子IDを種ごとに分割
genes = {
"Agra": gene_ids[0],
"Cass": gene_ids[1],
"Dpon": gene_ids[2],
"Smad": gene_ids[3],
"Sory": gene_ids[4],
"Tcas": gene_ids[5]
}
# 出力ファイルのパスを設定
output_file = os.path.join(output_dir, f"{og_number}.fna")
# 出力ファイルを開く
with open(output_file, "w") as out_f:
# 各種ごとに遺伝子IDを取得し、対応するファイルからシーケンスを検索
for species, gene_id in genes.items():
fasta_file = os.path.join(input_dir, f"{species}_changehedder.fasta")
with open(fasta_file, "r") as fasta_f:
write_flag = False
for line in fasta_f:
if line.startswith(f">{gene_id}"):
# ヘッダー行を見つけたら、出力ファイルに書き込みを開始
out_f.write(line)
print(line.strip()) # 標準出力にヘッダーを表示
write_flag = True
elif line.startswith(">") and write_flag:
# 次のヘッダー行が見つかったら、現在の遺伝子の書き込みを終了
write_flag = False
elif write_flag:
# シーケンス部分を書き込む
out_f.write(line)
print(line.strip()) # 標準出力にシーケンスを表示
print(f"{og_number}.fna ファイルが {output_dir} に保存されました。")5分くらいで終わる。
次にこれらのCDSをアライメントする。~/tools/for_paml/data/241008_SCOでmafft.shを作成、qsubで投げた。
### mafft.sh
#$ -S /bin/bash
source ~/tools/pyenv_env/ManualPhilo_profile
# ディレクトリパス
input_dir="/home/kosukesano/tools/for_paml/data/CDS_SCO/"
output_dir="/home/kosukesano/tools/for_paml/data/CDS_SCO/"
# 各ファイルに対してアラインメントを実行
for file in "$input_dir"*.fna; do
# 元のファイル名から拡張子を除いたものを取得
base_name=$(basename "$file" .fna)
# 出力ファイル名を生成
output_file="${output_dir}${base_name}_maffted.fna"
# MAFFTを実行
mafft --auto --maxiterate 1000 --localpair "$file" > "$output_file"
echo "Aligned file created: $output_file"
doneこっちは結構時間かかる。
この後、ヘッダーを種名のみにする必要があった。そのためのスクリプトfix.pyを~/tools/for_paml/data/CDS_SCOに作成。
### fix.pyの中身
import os
# 対象ディレクトリ
input_dir = "/home/kosukesano/tools/for_paml/data/CDS_SCO"
# ディレクトリ内の_maffted.fnaファイルを処理
for filename in os.listdir(input_dir):
if filename.endswith("_maffted.fna"):
input_filepath = os.path.join(input_dir, filename)
output_filename = filename.replace("_maffted.fna", "_maffted_fixed.fna")
output_filepath = os.path.join(input_dir, output_filename)
with open(input_filepath, 'r') as infile, open(output_filepath, 'w') as outfile:
for line in infile:
if line.startswith(">"):
# ヘッダーの「>」とその後の4文字に置き換える
new_header = ">" + line[1:5] + "\n"
outfile.write(new_header)
else:
# 配列行はそのまま書き込む
outfile.write(line)
print("ヘッダー置き換え処理が完了しました。")scorpionでのEkamソフトマスク続き
dendezia@scorpion:~$ ls
RM_671004.MonOct70853342024 old_envilonment_until20241004 pyenv_conda_environment tool
dendezia@scorpion:~$ ls RM_671004.MonOct70853342024/
consensi.fa consensi.fa.classified families-classified.stk families.stk round-1 round-2 round-3 round-4 round-5 round-6 tmpConsensi.faできてるけど、出力ファイルのRM_671004.MonOct70853342024/がホームディレクトリに行っちゃってる。
mvで移動させた。
dendezia@scorpion:~$ ls
RM_671004.MonOct70853342024 old_envilonment_until20241004 pyenv_conda_environment tool
dendezia@scorpion:~$ mv RM_671004.MonOct70853342024/ ~/tool/for_softmask/Ekam_softmask
dendezia@scorpion:~$ ls ~/tool/for_softmask/Ekam_softmask
Ekam_BLAST_DATABASE-families.fa Ekam_BLAST_DATABASE.njs Ekam_BLAST_DATABASE.nsq Ekam_RepeatModeler.sh.e2016 Ekam_RepeatModeler.sh.o2016
Ekam_BLAST_DATABASE-families.stk Ekam_BLAST_DATABASE.nnd Ekam_BLAST_DATABASE.translation Ekam_RepeatModeler.sh.e2017 Ekam_RepeatModeler.sh.o2017
Ekam_BLAST_DATABASE.nhr Ekam_BLAST_DATABASE.nni Ekam_RepeatModeler.sh Ekam_RepeatModeler.sh.e2018 Ekam_RepeatModeler.sh.o2018
Ekam_BLAST_DATABASE.nin Ekam_BLAST_DATABASE.nog Ekam_RepeatModeler.sh.e2015 Ekam_RepeatModeler.sh.o2015 RM_671004.MonOct70853342024
dendezia@scorpion:~$ 続いてRepeatMaskerを行う。~/tool/for_softmask/Ekam_softmaskにてEkam_RepeatMasker.shを作成し、qsubで投げた。
### Ekam_RepeatMasker.shの中身
#$ -S /bin/bash
#$ -cwd
echo start at
date
cd /home/dendezia/tool/for_softmask/Ekam_softmask/
source /home/dendezia/tool/pyenv_env/EDTA_profile
RepeatMasker -pa 6 -lib\
/home/dendezia/tool/for_softmask/Ekam_softmask/RM_671004.MonOct70853342024/consensi.fa.classified\
/home/dendezia/tool/for_softmask/nama_data/Ekam_dataset/data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna
date結果
dendezia@scorpion:~/tool/for_softmask/Ekam_softmask$ ls ../nama_data/Ekam_dataset/data/GCA_014849505.1/
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.tbl sequence_report.jsonl
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.cat.gz GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.out genomic.gbff
dendezia@scorpion:~/tool/for_softmask/Ekam_softmask$ 出力ファイルのGCA_014849505.1_AAL_Ekam_1.0_genomic.fna.cat.gzができた!
続いてProcessRepeatsを行う。~/tool/for_softmask/Ekam_softmaskにEkam_ProcessRepeats.shを作成した。
### Ekam_ProcessRepeats.shの中身
#$ -S /bin/bash
#$ -cwd
echo start at
date
cd /home/dendezia/tool/for_softmask/Ekam_softmask/
source /home/dendezia/tool/pyenv_env/EDTA_profile
ProcessRepeats\
-maskSource /home/dendezia/tool/for_softmask/nama_data/Ekam_dataset/data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna\
-xsmall\
-gff\
/home/dendezia/tool/for_softmask/nama_data/Ekam_dataset/data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.cat.gz
dateこれをqsubで投げた
IQ-TREE系統樹を使ったPAMLやり直し
~/tools/for_paml/241009_IQTREE_6sp下で/bsA/ディレクトリと/bs_null/ディレクトリを作成した。
bsAについて
~/tools/for_paml/241009_IQTREE_6sp/bsAでbsA_IQTREE_paml.shとtemplate.ctlを作成、bsA_IQTREE_paml.shの方をqsubで投げた。
### ~/tools/for_paml/241009_IQTREE_6sp/bsA/bsA_IQTREE_paml.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/data/CDS_SCO"
bsA_dir="/home/kosukesano/tools/for_paml/241009_IQTREE_6sp/bsA"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/template.ctl"
# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"
# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")
# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted.fna; do
if [[ -f "$file" ]]; then
base_name=$(basename "$file" .fna)
outfile_path="$result_dir/${base_name}_branch_alt"
# 一時的な制御ファイルの内容を生成
ctl_content="${ctl_template//<SEQFILE>/$file}"
ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"
# 一時的な制御ファイルを作成
ctl_path="$bsA_dir/bsA.ctl"
echo "$ctl_content" > "$ctl_path"
# PAMLを実行
singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"
echo "Processed file: $file, output: $outfile_path"
fi
done### ~/tools/for_paml/241009_IQTREE_6sp/bsA/template.ctlの中身
seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/IQTREE_6sp/data/new_tree_IQTREE_ultrametric.nwk
outfile = <OUTFILE>
noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
model = 2
NSsites = 2
fix_omega = 0
omega = 1
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0bs_nullについて
~/tools/for_paml/241009_IQTREE_6sp/bs_nullでbsN_IQTREE_paml.shとbsN_template.ctlを作成、bsN_IQTREE_paml.shの方をqsubで投げた。
### ~/tools/for_paml/241009_IQTREE_6sp/bs_null/bsN_IQTREE_paml.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/data/CDS_SCO"
bsA_dir="/home/kosukesano/tools/for_paml/241009_IQTREE_6sp/bs_null"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/bsN_template.ctl"
# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"
# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")
# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted.fna; do
if [[ -f "$file" ]]; then
base_name=$(basename "$file" .fna)
outfile_path="$result_dir/${base_name}_branch_alt"
# 一時的な制御ファイルの内容を生成
ctl_content="${ctl_template//<SEQFILE>/$file}"
ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"
# 一時的な制御ファイルを作成
ctl_path="$bsA_dir/bsA.ctl"
echo "$ctl_content" > "$ctl_path"
# PAMLを実行
singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"
echo "Processed file: $file, output: $outfile_path"
fi
done### ~/tools/for_paml/241009_IQTREE_6sp/bs_null/bsN_template.ctlの中身
seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/IQTREE_6sp/data/new_tree_IQTREE_ultrametric.nwk
outfile = <OUTFILE>
noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
model = 2
NSsites = 2
fix_omega = 1
omega = 1
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0ASTRAL系統樹を使ったPAMLやり直し
bsAについて
~/tools/for_paml/241009_ASTRAL_6sp/bsAでbsA_ASTRAL_paml.shとtemplate.ctlを作成、bsA_ASTRAL_paml.shの方をqsubで投げた。
### ~/tools/for_paml/241009_ASTRAL_6sp/bsA/bsA_ASTRAL_paml.shの中身
### run_paml.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/data/CDS_SCO"
bsA_dir="/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/bsA"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/template.ctl"
# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"
# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")
# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fna; do
if [[ -f "$file" ]]; then
base_name=$(basename "$file" .fna)
outfile_path="$result_dir/${base_name}_branch_alt"
# 一時的な制御ファイルの内容を生成
ctl_content="${ctl_template//<SEQFILE>/$file}"
ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"
# 一時的な制御ファイルを作成
ctl_path="$bsA_dir/bsA.ctl"
echo "$ctl_content" > "$ctl_path"
# PAMLを実行
singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"
echo "Processed file: $file, output: $outfile_path"
fi
done### ~/tools/for_paml/241009_ASTRAL_6sp/bsA/template.ctlの中身
seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/ASTRAL_6sp/data/new_tree_ASTRAL_ultrametric.nwk
outfile = <OUTFILE>
noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
model = 2
NSsites = 2
fix_omega = 0
omega = 1
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0bs_nullについて
~/tools/for_paml/241009_ASTRAL_6sp/bs_nullでbsN_ASTRAL_paml.shとbsN_template.ctlを作成、bsN_ASTRAL_paml.shの方をqsubで投げた。
### ~/tools/for_paml/241009_ASTRAL_6sp/bs_null/bsN_ASTRAL_paml.shの中身
### run_paml.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/data/CDS_SCO"
bsA_dir="/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/bs_null"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/bsN_template.ctl"
# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"
# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")
# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fna; do
if [[ -f "$file" ]]; then
base_name=$(basename "$file" .fna)
outfile_path="$result_dir/${base_name}_branch_alt"
# 一時的な制御ファイルの内容を生成
ctl_content="${ctl_template//<SEQFILE>/$file}"
ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"
# 一時的な制御ファイルを作成
ctl_path="$bsA_dir/bsA.ctl"
echo "$ctl_content" > "$ctl_path"
# PAMLを実行
singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"
echo "Processed file: $file, output: $outfile_path"
fi
done### ~/tools/for_paml/241009_ASTRAL_6sp/bs_null/bsN_template.ctlの中身
seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/ASTRAL_6sp/data/new_tree_ASTRAL_ultrametric.nwk
outfile = <OUTFILE>
noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
model = 2
NSsites = 2
fix_omega = 1
omega = 1
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 01010
scorpionでのEkamソフトマスク結果
Ekam_ProcessRepeats.shが終わり、~/tool/for_softmask//nama_data/Ekam_dataset/data/GCA_014849505.1/ディレクトリにGCA_014849505.1_AAL_Ekam_1.0_genomic.fna.maskedが出力された。
### GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.maskedの中身の一部
>JACGEL010000001.1 Elaeidobius kamerunicus isolate PL Ekam 1 scaffold-1, whole genome shotgun sequence
taaaacaataaaaatactttattttaatattcaatattgtattaatatat
aacttaattttctctattttaactaattttcaaACCCCTAACATGTTTTC
CAGTGAGccgctaaaaaaatatcacaaaatgaactttaagtttaagttGG
AAATTTAAGACTTGAAGCTAGCTAGGATGAGTCGNNNNNNNNNNNNNNNN
NNNNNNNNNNNNNNNNNNNNNNAAATACATCTTTGATTTGTAAGTATctg
tagtatatttttggaataaaatagtttattaaatatatttcggtTTTCCT
TTTCCCGTAGGACGTTGCAAAGTGGCGACgaggatttttatatttcccta
GAAAAATAGAACCCCCTAGTTGGGAAAATTAGTGGGTTTCTAAAATTCCG
GTAAAGTAAGAAAACGTGTAGTGTAGTGTGCAGATAGAATTTGaccctaa
aataaatgattggACTGTGCACATAAATCGTCTGATGATTCTATAAACAG
ACCAAAAAGAGTAATTTTACTCAATGGGCTTGCTCAAGAACCGTATATAT
TGCTACAAAACTTAAGTCTTTCAACGAAACCTTCTGAAGCTACTTACTAG
GACCTTCTCAAGTACTTTAATAGCTATTTAAAGTTTTCCGATTACAAGGA
CTTCGATGAGGTCGAGGTAGAGAAAACGCTGGCAACCGTGGAGGCCGAGT
GCATTTTGGAGGCGGTACTAGTGACCGGTCGGGCAGCGCAGGAACGAAAA
AATCAAGTGATAGTGATGTGTTCAATCTGTCGAAAAAGTAAACATTCCga
aaacaaatgttttcatCGTAATTTTAACAGGTTTTGCAGCTTCTGCAAAC
TAAAGCACATAATACagtaaactgtaaaaataaaatggacattgaacaaa
ataccaataatGACAATgtgaatgatttaaattttaatataaatttaaat
gaatttccgGTCTATACCACATAACATTTCTAGTcctattgaaatatttt
ttaaataatagtctGTATAATTTTGAACTGATTCAGGTGCAGTACTTTCG
TGTACACCCTATTCGAtgtatgcaaattatttttgagatattttctTGAT
TAAAACTTATGTAACATGATTAATTTGAGTGGTAGAATAATTTCACCAAT
TGGTCAAGTTGTCCTAAAGctggaatataataaacaagtttCGAATTTAA
.
.
.
.
.ソフトマスクに治ってる部分もあるけど、ハードマスクのままの部分もある。どういうことだ?
ASTRAL系統樹を使ったPAMLやり直しの続き
~/tools/for_paml/241009_ASTRAL_6spディレクトリでbs_lrp.pyを作成し、実行した。
### ~/tools/for_paml/241009_ASTRAL_6sp/bs_lrp.pyの中身
import os
import re
from scipy.stats import chi2
def parse_lnL(file_path):
with open(file_path, 'r') as f:
for line in f:
match = re.search(r'lnL\(ntime: \d+ np: (\d+)\):\s+(-?\d+\.\d+)', line)
if match:
np = int(match.group(1))
lnL = float(match.group(2))
return np, lnL
return None, None
def perform_lrt(alt_lnL, alt_np, null_lnL, null_np):
lr_stat = 2 * (alt_lnL - null_lnL)
df = alt_np - null_np
p_val = chi2.sf(lr_stat, df)
return p_val
def main():
alt_dir = '/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/bsA/result'
null_dir = '/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/bs_null/result'
output_file = 'branch_site_lrt_results.txt'
alt_dir = os.path.expanduser(alt_dir)
null_dir = os.path.expanduser(null_dir)
og_files = [f for f in os.listdir(alt_dir) if '_maffted_fixed_branch_alt' in f]
with open(output_file, 'w') as out_f:
out_f.write('OG_num\tp_val\tpositive_selection\n')
for og_file in og_files:
og_num = og_file.split('_')[0]
alt_file = os.path.join(alt_dir, og_file)
null_file = os.path.join(null_dir, og_file)
if os.path.exists(null_file):
alt_np, alt_lnL = parse_lnL(alt_file)
null_np, null_lnL = parse_lnL(null_file)
if alt_np is not None and null_np is not None:
p_val = perform_lrt(alt_lnL, alt_np, null_lnL, null_np)
reject_null = '+' if p_val < 0.05 else '-'
out_f.write(f'{og_num}\t{p_val}\t{reject_null}\n')
if __name__ == "__main__":
main()これを実行したところ、branch_site_lrt_results.txtができた。ローカルに送ってFDRにかける。
:~/bio/for_paml/241010$ scp kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/branch_site_lrt_results.txt /Users/kosukesano/bio/for_paml/241010/ASTRAL_branch_site_
lrt_results.txt
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
branch_site_lrt_results.txt 100% 27KB 684.1KB/s 00:00
:~/bio/for_paml/241010$ FDRにかけた結果は以下の通り
AST=read.csv("/Users/kosukesano/bio/for_paml/241010/hosei_ASTRAL_branch_site_lrt_results.txt", sep="\t")|>
dplyr::filter(significant == "True")
orthogroups_file <- "/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/Orthogroups.tsv"
# Orthogroups.tsvの読み込み
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
read.delim(orthogroups_file, header=FALSE, sep="\t",
#stringsAsFactors=FALSE,
#col.names = "Data"
skip=1
)|>
dplyr::select("V1", "V5")
AST_2=dplyr::left_join(AST, orthogroups, by = c(OG_num = "V1"))|>
rename(gene_ID = V5)|>
dplyr::mutate(gene_ID = stringr::str_replace(gene_ID, "^Smad_", ""))
fa<-read.csv("/Users/kosukesano/bio/functional_annotation/merged_with_gene_function.csv", sep=",")
AST_3=dplyr::left_join(AST_2, fa, by = c(gene_ID = "Madara"))
deg1=read.csv("/Users/kosukesano/bio/for_cafe/Deg/DEG_ovary_vs_body_DESeq2.csv", sep=",")
deg2=read.csv("/Users/kosukesano/bio/for_cafe/Deg/DEG_Adult_vs_Larva_DESeq2.csv", sep=",")
deg_all=dplyr::full_join(deg1, deg2, by = "gene_ID")
AST_4=dplyr::left_join(AST_3, deg_all, by = "gene_ID")|>###完成系
dplyr::select(gene_ID, q_val, Sory_GeneFunction, ovary.body_log2FC, ovary.body_adjPval, adult.llarva_log2FC, adult.llarva_adjPval, adult.mlarva_log2FC, adult.mlarva_adjPval) |>
print() gene_ID q_val
1 g2313.t1 3.310264e-02
2 g3206.t1 1.004678e-13
3 g12267.t1 2.607850e-02
4 g9945.t1 3.980078e-15
5 g4236.t1 8.062344e-03
6 g10787.t1 7.040355e-03
7 g6829.t1 1.627589e-02
8 g8864.t1 9.034552e-04
Sory_GeneFunction
1 protein PTCD3 homolog, mitochondrial
2 LOW QUALITY PROTEIN: cell division cycle and apoptosis regulator protein 1-like
3 D-glucuronyl C5-epimerase B
4 laminin subunit alpha
5 acyl-CoA dehydrogenase family member 9, mitochondrial
6 probable tRNA N6-adenosine threonylcarbamoyltransferase, mitochondrial
7 methyltransferase-like protein 17, mitochondrial
8 importin subunit alpha-3
ovary.body_log2FC ovary.body_adjPval adult.llarva_log2FC adult.llarva_adjPval
1 0.8232494 1.43e-13 NA NA
2 NA NA NA NA
3 NA NA NA NA
4 -1.6162768 9.89e-06 NA NA
5 NA NA NA NA
6 2.5426042 3.48e-17 NA NA
7 1.8062424 4.66e-14 NA NA
8 NA NA NA NA
adult.mlarva_log2FC adult.mlarva_adjPval
1 NA NA
2 NA NA
3 NA NA
4 NA NA
5 NA NA
6 NA NA
7 NA NA
8 NA NA
8つの遺伝子で正の選択が検出された
IQ-TREE系統樹を使ったPAMLやり直しの続き
~/tools/for_paml/241009_IQTREE_6spディレクトリでbs_lrp.pyを作成し、実行したが、謎のエラーが出て実行できなかった。
おそらくlnL後に空白が多く含まれている事が原因か?
### /home/kosukesano/tools/for_paml/241009_IQTREE_6sp/bs_null/result/OG0008385_maffted_fixed_branch_altの一部
TREE # 1: (1, ((2, 3), 4), (5, 6)); MP score: -1
lnL(ntime: 9 np: 13): -5213.435205 +0.000000
7..1 7..8 8..9 9..2 9..3 8..4 7..10 10..5 10..6
1.563575 0.056729 0.383439 0.872471 2.465303 1.081219 0.906124 1.759343 4.388369 2.018860 0.736067 0.164952 0.101980
Note: Branch length is defined as number of nucleotide substitutions per codon (not per neucleotide site).
tree length = 13.47657これを修正したファイルnew_bs_lrp.pyを作成し、実行した。
### new_lrp.pyの中身
import os
import re
from scipy.stats import chi2
def parse_lnL(file_path):
try:
with open(file_path, 'r') as f:
for line in f:
print(f"Processing line: {line.strip()}") # デバッグ用
match = re.search(r'lnL\(ntime:\s*\d+\s+np:\s*(\d+)\):\s+(-?\d+\.\d+)', line)
if match:
np = int(match.group(1))
lnL = float(match.group(2))
return np, lnL
print(f"{file_path} に 'lnL' 行が見つかりませんでした。形式を確認してください。")
return None, None
except Exception as e:
print(f"{file_path} を開く際にエラーが発生しました: {e}")
return None, None
def perform_lrt(alt_lnL, alt_np, null_lnL, null_np):
try:
lr_stat = 2 * (alt_lnL - null_lnL)
df = alt_np - null_np
p_val = chi2.sf(lr_stat, df)
return p_val
except Exception as e:
print(f"LRT計算中にエラーが発生しました: {e}")
return None
def main():
alt_dir = '/home/kosukesano/tools/for_paml/241009_IQTREE_6sp/bsA/result'
null_dir = '/home/kosukesano/tools/for_paml/241009_IQTREE_6sp/bs_null/result'
output_file = 'branch_site_lrt_results.txt'
alt_dir = os.path.expanduser(alt_dir)
null_dir = os.path.expanduser(null_dir)
# 処理するOGファイルリストの取得
og_files = [f for f in os.listdir(alt_dir) if '_maffted_fixed_branch_alt' in f]
with open(output_file, 'w') as out_f:
out_f.write('OG_num\tp_val\tpositive_selection\n')
# 各OGファイルについてループ処理
for idx, og_file in enumerate(og_files):
og_num = og_file.split('_')[0]
alt_file = os.path.join(alt_dir, og_file)
null_file = os.path.join(null_dir, og_file)
print(f"{idx+1}/{len(og_files)}: {og_num} の解析を開始します...")
if os.path.exists(null_file):
alt_np, alt_lnL = parse_lnL(alt_file)
null_np, null_lnL = parse_lnL(null_file)
if alt_np is not None and null_np is not None:
p_val = perform_lrt(alt_lnL, alt_np, null_lnL, null_np)
if p_val is not None:
reject_null = '+' if p_val < 0.05 else '-'
out_f.write(f'{og_num}\t{p_val}\t{reject_null}\n')
print(f"{og_num} の解析が完了しました。p値: {p_val}, 正の選択: {reject_null}")
else:
print(f"{og_num} のLRT計算に失敗しました。")
else:
print(f"{og_num} のlnLデータが不完全です。")
else:
print(f"{og_num} の対応するnullモデルファイルが見つかりませんでした。")
if __name__ == "__main__":
main()これを実行したところ、branch_site_lrt_results.txtに結果がちゃんと出力された。
これをローカルにコピー。
:~/bio/for_paml/241010$ scp kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_paml/241009_IQTREE_6sp/branch_site_lrt_results.txt /Users/kosukesano/bio/for_paml/241010/IQTREE_branch_site_lrt_results.txt
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
branch_site_lrt_results.txt 100% 27KB 967.8KB/s 00:00
:~/bio/for_paml/241010$ FDR後の結果は以下の通り
IQT=read.csv("/Users/kosukesano/bio/for_paml/241010/hosei_IQTREE_branch_site_lrt_results.txt", sep="\t")|>
dplyr::filter(significant == "True")
orthogroups_file <- "/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/Orthogroups.tsv"
# Orthogroups.tsvの読み込み
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
read.delim(orthogroups_file, header=FALSE, sep="\t",
#stringsAsFactors=FALSE,
#col.names = "Data"
skip=1
)|>
dplyr::select("V1", "V5")
IQT_2=dplyr::left_join(IQT, orthogroups, by = c(OG_num = "V1"))|>
rename(gene_ID = V5)|>
dplyr::mutate(gene_ID = stringr::str_replace(gene_ID, "^Smad_", ""))
fa<-read.csv("/Users/kosukesano/bio/functional_annotation/merged_with_gene_function.csv", sep=",")
IQT_3=dplyr::left_join(IQT_2, fa, by = c(gene_ID = "Madara"))
deg1=read.csv("/Users/kosukesano/bio/for_cafe/Deg/DEG_ovary_vs_body_DESeq2.csv", sep=",")
deg2=read.csv("/Users/kosukesano/bio/for_cafe/Deg/DEG_Adult_vs_Larva_DESeq2.csv", sep=",")
deg_all=dplyr::full_join(deg1, deg2, by = "gene_ID")
IQT_4=dplyr::left_join(IQT_3, deg_all, by = "gene_ID")|>###完成系
dplyr::select(gene_ID, q_val, Sory_GeneFunction, ovary.body_log2FC, ovary.body_adjPval, adult.llarva_log2FC, adult.llarva_adjPval, adult.mlarva_log2FC, adult.mlarva_adjPval) |>
print() gene_ID q_val
1 g12267.t1 2.458483e-02
2 g9945.t1 2.144997e-14
3 g10111.t1 1.777694e-04
4 g10787.t1 7.414778e-03
5 g7878.t1 1.201570e-12
6 g4328.t1 6.210582e-13
7 g1127.t1 6.411997e-03
8 g6829.t1 6.526610e-03
Sory_GeneFunction
1 D-glucuronyl C5-epimerase B
2 laminin subunit alpha
3 protein cueball
4 probable tRNA N6-adenosine threonylcarbamoyltransferase, mitochondrial
5 uncharacterized protein LOC115876326
6 cullin-5
7 ruvB-like helicase 1
8 methyltransferase-like protein 17, mitochondrial
ovary.body_log2FC ovary.body_adjPval adult.llarva_log2FC adult.llarva_adjPval
1 NA NA NA NA
2 -1.616277 9.89e-06 NA NA
3 NA NA NA NA
4 2.542604 3.48e-17 NA NA
5 -6.713844 1.54e-42 NA NA
6 1.180746 8.71e-09 NA NA
7 2.001888 1.17e-28 NA NA
8 1.806242 4.66e-14 NA NA
adult.mlarva_log2FC adult.mlarva_adjPval
1 NA NA
2 NA NA
3 NA NA
4 NA NA
5 NA NA
6 NA NA
7 NA NA
8 NA NA
PAML結果のまとめ
AST_join=AST|>
dplyr::mutate(ASTRAL = stringr::str_replace(positive_selection, "\\+", "ASTRAL"))
IQT_join=IQT|>
dplyr::mutate(IQTREE = stringr::str_replace(positive_selection, "\\+", "IQ-TREE"))
df=dplyr::full_join(IQT_join, AST_join, by = "OG_num")
orthogroups_file <- "/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/Orthogroups.tsv"
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
read.delim(orthogroups_file, header=FALSE, sep="\t",
#stringsAsFactors=FALSE,
#col.names = "Data"
skip=1
)|>
dplyr::select("V1", "V5")
df2=dplyr::left_join(df, orthogroups, by = c(OG_num = "V1"))|>
rename(gene_ID = V5)|>
dplyr::mutate(gene_ID = stringr::str_replace(gene_ID, "^Smad_", ""))
fa<-read.csv("/Users/kosukesano/bio/functional_annotation/merged_with_gene_function.csv", sep=",")
df3=dplyr::left_join(df2, fa, by = c(gene_ID = "Madara"))
deg1=read.csv("/Users/kosukesano/bio/for_cafe/Deg/DEG_ovary_vs_body_DESeq2.csv", sep=",")
deg2=read.csv("/Users/kosukesano/bio/for_cafe/Deg/DEG_Adult_vs_Larva_DESeq2.csv", sep=",")
deg_all=dplyr::full_join(deg1, deg2, by = "gene_ID")
df4=dplyr::left_join(df3, deg_all, by = "gene_ID")|>###完成系
tidyr::unite(tree_tool, ASTRAL, IQTREE, sep = "/") |>
dplyr::select(gene_ID, tree_tool, Sory_GeneFunction, ovary.body_log2FC, ovary.body_adjPval, adult.llarva_log2FC, adult.llarva_adjPval, adult.mlarva_log2FC, adult.mlarva_adjPval) |>
dplyr::mutate(tree_tool = stringr::str_replace(tree_tool, "\\/NA", ""))|>
dplyr::mutate(tree_tool = stringr::str_replace(tree_tool, "NA\\/", ""))|>
print() gene_ID tree_tool
1 g12267.t1 ASTRAL/IQ-TREE
2 g9945.t1 ASTRAL/IQ-TREE
3 g10111.t1 IQ-TREE
4 g10787.t1 ASTRAL/IQ-TREE
5 g7878.t1 IQ-TREE
6 g4328.t1 IQ-TREE
7 g1127.t1 IQ-TREE
8 g6829.t1 ASTRAL/IQ-TREE
9 g2313.t1 ASTRAL
10 g3206.t1 ASTRAL
11 g4236.t1 ASTRAL
12 g8864.t1 ASTRAL
Sory_GeneFunction
1 D-glucuronyl C5-epimerase B
2 laminin subunit alpha
3 protein cueball
4 probable tRNA N6-adenosine threonylcarbamoyltransferase, mitochondrial
5 uncharacterized protein LOC115876326
6 cullin-5
7 ruvB-like helicase 1
8 methyltransferase-like protein 17, mitochondrial
9 protein PTCD3 homolog, mitochondrial
10 LOW QUALITY PROTEIN: cell division cycle and apoptosis regulator protein 1-like
11 acyl-CoA dehydrogenase family member 9, mitochondrial
12 importin subunit alpha-3
ovary.body_log2FC ovary.body_adjPval adult.llarva_log2FC
1 NA NA NA
2 -1.6162768 9.89e-06 NA
3 NA NA NA
4 2.5426042 3.48e-17 NA
5 -6.7138444 1.54e-42 NA
6 1.1807457 8.71e-09 NA
7 2.0018883 1.17e-28 NA
8 1.8062424 4.66e-14 NA
9 0.8232494 1.43e-13 NA
10 NA NA NA
11 NA NA NA
12 NA NA NA
adult.llarva_adjPval adult.mlarva_log2FC adult.mlarva_adjPval
1 NA NA NA
2 NA NA NA
3 NA NA NA
4 NA NA NA
5 NA NA NA
6 NA NA NA
7 NA NA NA
8 NA NA NA
9 NA NA NA
10 NA NA NA
11 NA NA NA
12 NA NA NA
合計で12個の遺伝子に正の選択が見られた
1011
scorpion環境でのBRAKER3インストール
以下のコマンドを実行
conda install -c anaconda perl
conda install -c anaconda biopython
conda install -c bioconda perl-app-cpanminus
conda install -c bioconda perl-file-spec
conda install -c bioconda perl-hash-merge
conda install -c bioconda perl-list-util
conda install -c bioconda perl-module-load-conditional
conda install -c bioconda perl-posix
conda install -c bioconda perl-file-homedir
conda install -c bioconda perl-parallel-forkmanager
conda install -c bioconda perl-scalar-util-numeric
conda install -c bioconda perl-yaml
conda install -c bioconda perl-exception-class
conda install -c bioconda perl-class-data-inheritable
conda install -c bioconda perl-test-pod
conda install -c bioconda perl-file-which
conda install -c bioconda perl-mce
conda install -c bioconda perl-threaded
conda install -c bioconda perl-list-util
conda install -c bioconda perl-math-utils
conda install -c bioconda cdbtools
conda install -c eumetsat perl-yaml-xs
conda install -c bioconda perl-data-dumperconda install anaconda::gcc_linux-64perlモジュールのインストール
cpanm Hash::Merge
cpanm List::Util
cpanm MCE::Mutex
cpanm Module::Load::Conditional
cpanm Parallel::Forkcpanm
cpanm Scalar::Util::Numeric
cpanm YAML
cpanm Math::Utils
cpanm File::HomeDir
cpanm Thread::Queue(braker) dendezia@scorpion:~/tool$ cpanm File::Spec::Functions
--> Working on File::Spec::Functions
Fetching http://www.cpan.org/authors/id/X/XS/XSAWYERX/PathTools-3.75.tar.gz ... OK
Configuring PathTools-3.75 ... OK
Building and testing PathTools-3.75 ... FAIL
! Installing File::Spec::Functions failed. See /home/dendezia/.cpanm/work/1728622512.1157695/build.log for details. Retry with --force to force install it.
(braker) dendezia@scorpion:~/tool$ cpanm YAML::XS
--> Working on YAML::XS
Fetching http://www.cpan.org/authors/id/T/TI/TINITA/YAML-LibYAML-v0.902.0.tar.gz ... OK
Configuring YAML-LibYAML-v0.902.0 ... OK
Building and testing YAML-LibYAML-v0.902.0 ... FAIL
! Installing YAML::XS failed. See /home/dendezia/.cpanm/work/1728622576.1157970/build.log for details. Retry with --force to force install it.
(braker) dendezia@scorpion:~/tool$ cpanm Data::Dumper
--> Working on Data::Dumper
Fetching http://www.cpan.org/authors/id/N/NW/NWCLARK/Data-Dumper-2.183.tar.gz ... OK
Configuring Data-Dumper-2.183 ... OK
Building and testing Data-Dumper-2.183 ... FAIL
! Installing Data::Dumper failed. See /home/dendezia/.cpanm/work/1728622586.1158296/build.log for details. Retry with --force to force install it.
(braker) dendezia@scorpion:~/tool$ cpanm threads
--> Working on threads
Fetching http://www.cpan.org/authors/id/J/JD/JDHEDDEN/threads-2.21.tar.gz ... OK
Configuring threads-2.21 ... N/A
! Configure failed for threads-2.21. See /home/dendezia/.cpanm/work/1728622605.1158565/build.log for details.
(braker) dendezia@scorpion:~/tool$ 4つのモジュールでエラー。前の記録を見ると同じエラーが出てるけど放置してるっぽい。
BRAKER本体やその他ツールのインストール
(braker) dendezia@scorpion:~/tool/braker_git_install$ git clone https://github.com/Gaius-Augustus/BRAKER.git
Cloning into 'BRAKER'...
remote: Enumerating objects: 7335, done.
remote: Counting objects: 100% (1677/1677), done.
remote: Compressing objects: 100% (667/667), done.
remote: Total 7335 (delta 1079), reused 1539 (delta 987), pack-reused 5658 (from 1)
Receiving objects: 100% (7335/7335), 123.45 MiB | 24.97 MiB/s, done.
Resolving deltas: 100% (5430/5430), done.
(braker) dendezia@scorpion:~/tool/braker_git_install$ ls
BRAKER
(braker) dendezia@scorpion:~/tool/braker_git_install$
(braker) dendezia@scorpion:~/tool/braker_git_install$ git clone https://github.com/gatech-genemark/ProtHint.git
Cloning into 'ProtHint'...
remote: Enumerating objects: 1289, done.
remote: Counting objects: 100% (257/257), done.
remote: Compressing objects: 100% (91/91), done.
remote: Total 1289 (delta 170), reused 249 (delta 166), pack-reused 1032 (from 1)
Receiving objects: 100% (1289/1289), 56.69 MiB | 15.88 MiB/s, done.
Resolving deltas: 100% (812/812), done.
(braker) dendezia@scorpion:~/tool/braker_git_install$ git clone https://github.com/Gaius-Augustus/TSEBRA.git
Cloning into 'TSEBRA'...
remote: Enumerating objects: 1443, done.
remote: Counting objects: 100% (293/293), done.
remote: Compressing objects: 100% (147/147), done.
remote: Total 1443 (delta 179), reused 237 (delta 143), pack-reused 1150 (from 1)
Receiving objects: 100% (1443/1443), 59.02 MiB | 20.51 MiB/s, done.
Resolving deltas: 100% (912/912), done.
(braker) dendezia@scorpion:~/tool/braker_git_install$ git clone https://github.com/gatech-genemark/GeneMark-ETP.git
Cloning into 'GeneMark-ETP'...
remote: Enumerating objects: 482, done.
remote: Counting objects: 100% (46/46), done.
remote: Compressing objects: 100% (37/37), done.
remote: Total 482 (delta 11), reused 31 (delta 7), pack-reused 436 (from 1)
Receiving objects: 100% (482/482), 56.91 MiB | 21.75 MiB/s, done.
Resolving deltas: 100% (230/230), done.
Updating files: 100% (249/249), done.遺伝研での環境は各ツール別にディレクトリを作っていたが、今回はbraker_git_installに全て集約することにした。
プロテインデータベースのダウンロード
ローカルで実行した。
:~/Downloads$ scp ~/Downloads/Arthropoda.fa.gz dendezia@scorpion:/home/dendezia/tool/braker_git_install
Host key fingerprint is SHA256:KPa37JYErRVG/1YWy31gMOwAs13hHzUeg3opGD75qVY
+--[ED25519 256]--+
| .+. .=o=+.|
| o*.o.=.*+|
| oo.*oo B.o|
| ..o= +.* ..|
| o .+S o * . |
| . o. . E |
| ....o |
| oo+ |
| o= |
+----[SHA256]-----+
Arthropoda.fa.gz 100% 1219MB 99.7MB/s 00:12
:~/Downloads$ これをscorpionで解凍
(braker) dendezia@scorpion:~/tool/braker_git_install$ gunzip Arthropoda.fa.gz
(braker) dendezia@scorpion:~/tool/braker_git_install$ ls
Arthropoda.fa BRAKER GeneMark-ETP ProtHint TSEBRA遺伝研スパコンでのEkamのBRAKER実行
:~/Downloads$ scp dendezia@scorpion:/home/dendezia/tool/for_softmask/nama_data/Ekam_dataset/data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz /Users/kosukesano/bio/
Host key fingerprint is SHA256:KPa37JYErRVG/1YWy31gMOwAs13hHzUeg3opGD75qVY
+--[ED25519 256]--+
| .+. .=o=+.|
| o*.o.=.*+|
| oo.*oo B.o|
| ..o= +.* ..|
| o .+S o * . |
| . o. . E |
| ....o |
| oo+ |
| o= |
+----[SHA256]-----+
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz
:~/Downloads$ scp /Users/kosukesano/bio/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_braker/nama_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz 100% 92MB 102.2MB/s 00:00
:~/Downloads$ scorpionでソフトマスクしたEkamのゲノムデータを遺伝研に転送
kosukesano@at139:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta Madara_RNAseq busco_downloads femo_busco.sh.pe26221930 kohuki_busco.sh.o26238968 length.txt
BUSCO_OUTPUT_FEMO_GENOME Sfem_RNAseq femo_busco.sh femo_busco.sh.po26221930 kohuki_busco.sh.pe26238968 madaralength.txt
BUSCO_OUTPUT_KOHUKI_GENOME Sfem_pilon_softmasked.fasta femo_busco.sh.e26221930 kohuki_busco.sh kohuki_busco.sh.po26238968
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz Sfem_softmasked.fasta femo_busco.sh.o26221930 kohuki_busco.sh.e26238968 kohuki_softmasked.fasta
kosukesano@at139:~/tools/for_braker/nama_data$ unzip GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz
Archive: GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz
inflating: GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked
kosukesano@at139:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta Madara_RNAseq femo_busco.sh kohuki_busco.sh kohuki_softmasked.fasta
BUSCO_OUTPUT_FEMO_GENOME Sfem_RNAseq femo_busco.sh.e26221930 kohuki_busco.sh.e26238968 length.txt
BUSCO_OUTPUT_KOHUKI_GENOME Sfem_pilon_softmasked.fasta femo_busco.sh.o26221930 kohuki_busco.sh.o26238968 madaralength.txt
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked Sfem_softmasked.fasta femo_busco.sh.pe26221930 kohuki_busco.sh.pe26238968
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz busco_downloads femo_busco.sh.po26221930 kohuki_busco.sh.po26238968
kosukesano@at139:~/tools/for_braker/nama_data$ less GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked
kosukesano@at139:~/tools/for_braker/nama_data$ mv GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked Elaeidobius_kamerunicus.masked.fna
kosukesano@at139:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz Sfem_softmasked.fasta femo_busco.sh.o26221930 kohuki_busco.sh.e26238968 kohuki_softmasked.fasta
BUSCO_OUTPUT_FEMO_GENOME Madara_RNAseq busco_downloads femo_busco.sh.pe26221930 kohuki_busco.sh.o26238968 length.txt
BUSCO_OUTPUT_KOHUKI_GENOME Sfem_RNAseq femo_busco.sh femo_busco.sh.po26221930 kohuki_busco.sh.pe26238968 madaralength.txt
Elaeidobius_kamerunicus.masked.fna Sfem_pilon_softmasked.fasta femo_busco.sh.e26221930 kohuki_busco.sh kohuki_busco.sh.po26238968
kosukesano@at139:~/tools/for_braker/nama_data$ Elaeidobius_kamerunicus.masked.fnaにファイル名を変更
~/tools/for_braker/Ekamディレクトリを作成し、BRAKERを実行した。投げたスクリプトは以下の通り。
### Ekam_braker.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/Elaeidobius_kamerunicus.masked.fna\
--prot_seq=/home/kosukesano/tools/Arthropoda.fa\
--threads=16\
--species=Smadaranus_withRNA\
--AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
--AUGUSTUS_BIN_PATH=/usr/bin\
--AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
--TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
date1013
EkamのBRAKER続き
なんか知らんエラーがでとる!
### Ekam_braker.sh.e27018010の中身
#**********************************************************************************
# BRAKER CONFIGURATION
#**********************************************************************************
# BRAKER CALL: /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/Elaeidobius_kamerunicus.masked.fna --prot_seq=/home/kosukesano/tools/Arthropoda.fa --threads=16 --species=Ekamerunicus --AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config --AUGUSTUS_BIN_PATH=/usr/bin --AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts --GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin --PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin --TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
# Sun Oct 13 16:19:40 2024: braker.pl version 3.0.8
# Sun Oct 13 16:19:40 2024: Only Protein input detected, BRAKER will be executed in EP mode (BRAKER2).
# Sun Oct 13 16:19:40 2024: Configuring of BRAKER for using external tools...
# Sun Oct 13 16:19:40 2024: Trying to set $AUGUSTUS_CONFIG_PATH...
# Sun Oct 13 16:19:40 2024: Found command line argument $AUGUSTUS_CONFIG_PATH.
# Sun Oct 13 16:19:40 2024: Checking /usr/share/augustus/config as potential path for $AUGUSTUS_CONFIG_PATH.
# Sun Oct 13 16:19:40 2024: Success! Setting $AUGUSTUS_CONFIG_PATH to /usr/share/augustus/config!
# Sun Oct 13 16:19:40 2024: WARNING: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1933
AUGUSTUS_CONFIG_PATH/species (in this case /usr/share/augustus/config/species) is not writeable. BRAKER will try to copy the AUGUSTUS config directory to a writeable location.
#*********
# WARNING: Detected whitespace in fasta header of file /home/kosukesano/tools/for_braker/nama_data/Elaeidobius_kamerunicus.masked.fna. This may later on cause problems! The pipeline will create a new file without spaces or "|" characters and a genome_header.map file to look up the old and new headers. This message will be suppressed from now on!
#*********
ERROR in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 5258
Failed to execute: /home/kosukesano/.pyenv/versions/anaconda3-2020.11/envs/braker/bin/perl /home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin/gmes/gmes_petap.pl --verbose --cores=16 --ES --gc_donor 0.001 --sequence=/lustre7/home/kosukesano/tools/for_braker/Ekam/braker/genome.fa --soft_mask auto 1>/lustre7/home/kosukesano/tools/for_braker/Ekam/braker/GeneMark-ES.stdout 2>/lustre7/home/kosukesano/tools/for_braker/Ekam/braker/errors/GeneMark-ES.stderr !GeneMark-ESでエラーが出てるみたいなので、braker/errors/GeneMark-ES.stderrを確認する。
### GeneMark-ES.stderrの中身
error, file not found: data/training.fnaこんな感じ。
これをググってみたらこちらのページがヒットした。どうもインプットのゲノムがマスクされすぎていて、BRAKERが配列を探せなかったらしい?
そもそも元のデータがすでにソフトマスクされていないか?
### ~/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fnaの中身の一部
>JACGEL010000001.1 Elaeidobius kamerunicus isolate PL Ekam 1 scaffold-1, whole genome shotgun sequence
taaaacaataaaaatactttattttaatattcaatattgtattaatatataacttaattttctctattttaactaatttt
caaACCCCTAACATGTTTTCCAGTGAGccgctaaaaaaatatcacaaaatgaactttaagtttaagttGGAAATTTAAGA
CTTGAAGCTAGCTAGGATGAGTCGNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNNAAATACATCTTTGATTTG
TAAGTATctgtagtatatttttggaataaaatagtttattaaatatatttcggtTTTCCTTTTCCCGTAGGACGTTGCAA
AGTGGCGACgaggatttttatatttccctaGAAAAATAGAACCCCCTAGTTGGGAAAATTAGTGGGTTTCTAAAATTCCG
GTAAAGTAAGAAAACGTGTAGTGTAGTGTGCAGATAGAATTTGaccctaaaataaatgattggACTGTGCACATAAATCG
TCTGATGATTCTATAAACAGACCAAAAAGAGTAATTTTACTCAATGGGCTTGCTCAAGAACCGTATATATTGCTACAAAA
CTTAAGTCTTTCAACGAAACCTTCTGAAGCTACTTACTAGGACCTTCTCAAGTACTTTAATAGCTATTTAAAGTTTTCCG
ATTACAAGGACTTCGATGAGGTCGAGGTAGAGAAAACGCTGGCAACCGTGGAGGCCGAGTGCATTTTGGAGGCGGTACTA
GTGACCGGTCGGGCAGCGCAGGAACGAAAAAATCAAGTGATAGTGATGTGTTCAATCTGTCGAAAAAGTAAACATTCCga
aaacaaatgttttcatCGTAATTTTAACAGGTTTTGCAGCTTCTGCAAACTAAAGCACATAATACagtaaactgtaaaaa
taaaatggacattgaacaaaataccaataatGACAATgtgaatgatttaaattttaatataaatttaaatgaatttccgG
TCTATACCACATAACATTTCTAGTcctattgaaatattttttaaataatagtctGTATAATTTTGAACTGATTCAGGTGC
AGTACTTTCGTGTACACCCTATTCGAtgtatgcaaattatttttgagatattttctTGATTAAAACTTATGTAACATGAT
TAATTTGAGTGGTAGAATAATTTCACCAATTGGTCAAGTTGTCCTAAAGctggaatataataaacaagtttCGAATTTAA
CGCGTTGCAATATAAAAGAGAGTAATATACCCTTGCTGGGACGAGATTTCATTGCAGAATTTAAGCAATTATTTAGGGTG
taagcaaattaattatataactaaacatagttttgatattaaataaaataggatgtttgaataaaatttaattaaaatta
tttctgaagGAAAGTTATGTTCCAAAATTTATCCAGCCCAATTAGAGTTAGATAGGTTGGTTGTTTCTGGAATTATTACT
CCAGTTAAGCATTCAGACTTGGGAACACCAATTGTCCATGTTCTAAAAGAAGATGGCTCTGTTCGCATTTGTGGTGACga
gaaaataacattaaatccatttttatagaattatagaaatatagcCCTTCAGCAAATTACGAAATATTATATCAGTATAC
TTTGCCTTGCATAAATAATTCTGCACTCAGAATAGATGCACAatttatgttcaaaatatttaaaaaaatacaaaaaaatg
caagagAGAAAGACGGTTTTAAGGATATGCATTTAACtagtttttattagtattgttTTTGATCTAAATTTTAACGTAAA
CATACTCGTTAAGTAAGTTAATTttgggttttaaaaaagtaagccTTAAGGAAGaggttattgtattttagataaatatt
tctacagCGCAAGGATAAATTTAAGTTCCGTAAATATCAGTTCCCATTTATTACTGCCATTTTTAAACGCatttataatg
caaaaaaataataaggagaTTTTTCATTAACCTTCAACAGATTATTAGGTTTATCAGCAAATCGGGGACGATTTCATTaaこれこのままかけてよくね?
というわけでこの生データをBRAKERにかける
kosukesano@at139:~/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1$ cp GCA_014849505.1_AAL_Ekam_1.0_genomic.fna ~/tools/for_braker/nama_data/
kosukesano@at139:~/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1$ cd ~/tools/for_braker/nama_data/
kosukesano@at139:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz busco_downloads femo_busco.sh.po26221930 kohuki_busco.sh.po26238968
BUSCO_OUTPUT_FEMO_GENOME Madara_RNAseq femo_busco.sh kohuki_busco.sh kohuki_softmasked.fasta
BUSCO_OUTPUT_KOHUKI_GENOME Sfem_RNAseq femo_busco.sh.e26221930 kohuki_busco.sh.e26238968 length.txt
Elaeidobius_kamerunicus.masked.fna Sfem_pilon_softmasked.fasta femo_busco.sh.o26221930 kohuki_busco.sh.o26238968 madaralength.txt
GCA_014849505.1_AAL_Ekam_1.0_genomic.fna Sfem_softmasked.fasta femo_busco.sh.pe26221930 kohuki_busco.sh.pe26238968
kosukesano@at139:~/tools/for_braker/nama_data$ mv GCA_014849505.1_AAL_Ekam_1.0_genomic.fna Ekam_NotUseEDTA.fna
kosukesano@at139:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz busco_downloads femo_busco.sh.po26221930 kohuki_busco.sh.po26238968
BUSCO_OUTPUT_FEMO_GENOME Madara_RNAseq femo_busco.sh kohuki_busco.sh kohuki_softmasked.fasta
BUSCO_OUTPUT_KOHUKI_GENOME Sfem_RNAseq femo_busco.sh.e26221930 kohuki_busco.sh.e26238968 length.txt
Ekam_NotUseEDTA.fna Sfem_pilon_softmasked.fasta femo_busco.sh.o26221930 kohuki_busco.sh.o26238968 madaralength.txt
Elaeidobius_kamerunicus.masked.fna Sfem_softmasked.fasta femo_busco.sh.pe26221930 kohuki_busco.sh.pe26238968
kosukesano@at139:~/tools/for_braker/nama_data$ これでかけたけど同じエラーが出た。
じゃあ全部大文字にしてやんよ!
~/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1ディレクトリ内で以下のコードをかいた。
### oomoji.py
def convert_lowercase_to_uppercase(input_file, output_file):
with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
for line in infile:
if line.startswith('>'):
# ヘッダー行はそのまま出力
outfile.write(line)
else:
# 塩基配列の小文字を大文字に置換
outfile.write(line.upper())
input_file = '/home/kosukesano/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic.fna'
output_file = '/home/kosukesano/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1/GCA_014849505.1_AAL_Ekam_1.0_genomic_upper.fna'
convert_lowercase_to_uppercase(input_file, output_file)というわけでこれの出力でBRAKERをかける
kosukesano@at139:~/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1$ cp GCA_014849505.1_AAL_Ekam_1.0_genomic_upper.fna ~/tools/for_braker/nama_data/
kosukesano@at139:~/tools/for_softmask/nama_data/Ekam_data/GCA_014849505.1$ cd ~/tools/for_braker/nama_data/
kosukesano@at139:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz Sfem_softmasked.fasta femo_busco.sh.pe26221930 kohuki_busco.sh.pe26238968
BUSCO_OUTPUT_FEMO_GENOME GCA_014849505.1_AAL_Ekam_1.0_genomic_upper.fna busco_downloads femo_busco.sh.po26221930 kohuki_busco.sh.po26238968
BUSCO_OUTPUT_KOHUKI_GENOME Madara_RNAseq femo_busco.sh kohuki_busco.sh kohuki_softmasked.fasta
Ekam_NotUseEDTA.fna Sfem_RNAseq femo_busco.sh.e26221930 kohuki_busco.sh.e26238968 length.txt
Elaeidobius_kamerunicus.masked.fna Sfem_pilon_softmasked.fasta femo_busco.sh.o26221930 kohuki_busco.sh.o26238968 madaralength.txt
kosukesano@at139:~/tools/for_braker/nama_data$ mv GCA_014849505.1_AAL_Ekam_1.0_genomic_upper.fna Ekam_oomoji.fna
kosukesano@at139:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta Elaeidobius_kamerunicus.masked.fna Sfem_softmasked.fasta femo_busco.sh.pe26221930 kohuki_busco.sh.pe26238968
BUSCO_OUTPUT_FEMO_GENOME GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz busco_downloads femo_busco.sh.po26221930 kohuki_busco.sh.po26238968
BUSCO_OUTPUT_KOHUKI_GENOME Madara_RNAseq femo_busco.sh kohuki_busco.sh kohuki_softmasked.fasta
Ekam_NotUseEDTA.fna Sfem_RNAseq femo_busco.sh.e26221930 kohuki_busco.sh.e26238968 length.txt
Ekam_oomoji.fna Sfem_pilon_softmasked.fasta femo_busco.sh.o26221930 kohuki_busco.sh.o26238968 madaralength.txt
kosukesano@at139:~/tools/for_braker/nama_data$ cd ../Ekam/
kosukesano@at139:~/tools/for_braker/Ekam$ ls
Ekam_braker.sh Ekam_braker.sh.e27018371 Ekam_braker.sh.o27018371 Ekam_braker.sh.pe27018371 Ekam_braker.sh.po27018371
Ekam_braker.sh.e27004779 Ekam_braker.sh.o27004779 Ekam_braker.sh.pe27004779 Ekam_braker.sh.po27004779 braker
Ekam_braker.sh.e27018010 Ekam_braker.sh.o27018010 Ekam_braker.sh.pe27018010 Ekam_braker.sh.po27018010 gputest
kosukesano@at139:~/tools/for_braker/Ekam$ rm -r braker/
kosukesano@at139:~/tools/for_braker/Ekam$ nano Ekam_braker.sh
kosukesano@at139:~/tools/for_braker/Ekam$ qsub Ekam_braker.shこれでも同じエラーが出てるんだけど ……。
デバッグのため、もう一度マダラのゲノムにbrakerをかける。これで同じエラーが出るならbrakerの環境が悪い。
~/tools/for_braker/241013_for_debag_madaraディレクトリを用意し、madara_braker.shをコピーしてqsubで実行。
Pstrのソフトマスク、マスキングの復元
Ekamに倣ってマスクを戻す。
### oomoji.py
def convert_lowercase_to_uppercase(input_file, output_file):
with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
for line in infile:
if line.startswith('>'):
# ヘッダー行はそのまま出力
outfile.write(line)
else:
# 塩基配列の小文字を大文字に置換
outfile.write(line.upper())
input_file = "/home/kosukesano/tools/for_softmask/nama_data/Pstr_data/GCA_016904865.1/GCA_016904865.1_GSC_weevil_1.0_genomic.fna"
output_file = "/home/kosukesano/tools/for_softmask/nama_data/Pstr_data/GCA_016904865.1/Pstr_oomoji.fna"
convert_lowercase_to_uppercase(input_file, output_file)大文字に戻して、これを自分でソフトマスクする。それ用のディレクトリを作成。
kosukesano@at139:~/tools/for_softmask$ mkdir Pstr_oomoji_softmaskデータベース作成
(EDTA2) kosukesano@at137:~/tools/for_softmask/Pstr_oomoji_softmask$ BuildDatabase -name Pstr_BLAST_DATABASE_PREFIX /home/kosukesano/tools/for_softmask/nama_data/Pstr_data/GCA_016904865.1/Pstr_oomoji.fna
Building database Pstr_BLAST_DATABASE_PREFIX:
Reading /home/kosukesano/tools/for_softmask/nama_data/Pstr_data/GCA_016904865.1/Pstr_oomoji.fna...
Number of sequences (bp) added to database: 84140 ( 2025024129 bp )
(EDTA2) kosukesano@at137:~/tools/for_softmask/Pstr_oomoji_softmask$ RepeatModeler
#$ -S /bin/bash
#$ -cwd
#$ -l intel
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
RepeatModeler -database Pstr_BLAST_DATABASE_PREFIX -pa 6
dateこれをqsubで投げた。コア数とかは調整してもいいかも。
1015
Pstrのソフトマスク
Pstrもすでにマスクされてるみたいなので必要あるかわからないけど一応やっとく。
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
RepeatModeler -database Pstr_BLAST_DATABASE_PREFIX -pa 6
datePstrのbraker実行
~/tools/for_braker/Pstrを作り、Pstr_braker.shを作成。
試しにローカルで実行してみる。
scorpionでのオジロソフトマスク
dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ source /home/dendezia/tool/pyenv_env/EDTA_profile
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ BuildDatabase -name Ojiro_BLAST_DATABASE ../nama_data/Release_241005-ojiro_hifiasm/out.p_ctg.fa
Building database Ojiro_BLAST_DATABASE:
Reading ../nama_data/Release_241005-ojiro_hifiasm/out.p_ctg.fa...
Number of sequences (bp) added to database: 328 ( 736756452 bp )
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ ls
Ojiro_BLAST_DATABASE.nhr Ojiro_BLAST_DATABASE.njs Ojiro_BLAST_DATABASE.nni Ojiro_BLAST_DATABASE.nsq
Ojiro_BLAST_DATABASE.nin Ojiro_BLAST_DATABASE.nnd Ojiro_BLAST_DATABASE.nog Ojiro_BLAST_DATABASE.translation
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ nano Ojiro_RepeatModeler.sh
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ nano Ojiro_RepeatModeler.sh
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ qsub Ojiro_RepeatModeler.sh
2021.scorpion
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ qstat
Job id Name User Time Use S Queue
---------------- ---------------- ---------------- -------- - -----
2021.scorpion Ojiro_RepeatMod* dendezia 0 R batch
(EDTA2) dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ 遺伝研でのオジロのソフトマスク
遺伝研でも入りそうなので、どっちが早いかわからないけどとりあえず入れてみる
オジロのゲノムデータを遺伝研に転送
:~/Downloads$ scp ~/Downloads/Release_241005-ojiro_hifiasm.tar.gz kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_softmask/nama_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
Release_241005-ojiro_hifiasm.tar.gz 100% 2342MB 90.8MB/s 00:25
:~/Downloads$ 遺伝研でそれを解凍
kosukesano@at139:~/tools/for_softmask/nama_data$ tar -xzf Release_241005-ojiro_hifiasm.tar.gz
kosukesano@at139:~/tools/for_softmask/nama_data$ ls
231117_madaragenome.fasta Madara_ProcessRepeats.sh.e26141043 Madara_busco.sh Madara_busco.sh.po26146490 Sfem_ProcessRepeats.sh.pe26141247 core.64019
231117_madaragenome.fasta.cat.gz Madara_ProcessRepeats.sh.e26141224 Madara_busco.sh.e26144664 Pstr_data Sfem_ProcessRepeats.sh.po26141154 core.65135
231117_madaragenome.fasta.fasta.out Madara_ProcessRepeats.sh.e26141230 Madara_busco.sh.e26144679 Pstr_ncbi_dataset.zip Sfem_ProcessRepeats.sh.po26141247 core.65374
231117_madaragenome.fasta.masked Madara_ProcessRepeats.sh.o26141043 Madara_busco.sh.e26146490 README.md Sfem_assembly.fasta core.65380
231117_madaragenome.fasta.out Madara_ProcessRepeats.sh.o26141224 Madara_busco.sh.o26144664 Release_241005-ojiro_hifiasm Sfem_assembly.fasta.cat.gz core.65483
231117_madaragenome.fasta.out.gff Madara_ProcessRepeats.sh.o26141230 Madara_busco.sh.o26144679 Release_241005-ojiro_hifiasm.tar.gz Sfem_assembly.fasta.masked core.65491
231117_madaragenome.fasta.tbl Madara_ProcessRepeats.sh.pe26141043 Madara_busco.sh.o26146490 Sfem_ProcessRepeats.sh Sfem_assembly.fasta.out md5sum.txt
BUSCO_OUTPUT_FEMO Madara_ProcessRepeats.sh.pe26141224 Madara_busco.sh.pe26144664 Sfem_ProcessRepeats.sh.e26141154 Sfem_assembly.fasta.out.gff
BUSCO_OUTPUT_MADARA Madara_ProcessRepeats.sh.pe26141230 Madara_busco.sh.pe26144679 Sfem_ProcessRepeats.sh.e26141247 Sfem_assembly.fasta.tbl
Ekam_data Madara_ProcessRepeats.sh.po26141043 Madara_busco.sh.pe26146490 Sfem_ProcessRepeats.sh.o26141154 Sfem_pilon
Ekam_ncbi_dataset.zip Madara_ProcessRepeats.sh.po26141224 Madara_busco.sh.po26144664 Sfem_ProcessRepeats.sh.o26141247 busco_1897137032.log
Madara_ProcessRepeats.sh Madara_ProcessRepeats.sh.po26141230 Madara_busco.sh.po26144679 Sfem_ProcessRepeats.sh.pe26141154 busco_downloads
kosukesano@at139:~/tools/for_softmask/nama_data$ ls Release_241005-ojiro_hifiasm
hifiasm.sh out.bp.hap2.p_ctg.gfa out.bp.p_ctg.lowQ.bed out.bp.p_utg.noseq.gfa out.hap1.p_ctg.fa out.p_ctg.fa
out.bp.hap1.p_ctg.gfa out.bp.hap2.p_ctg.lowQ.bed out.bp.p_ctg.noseq.gfa out.bp.r_utg.gfa out.hap1.p_ctg.fa.sort.fasta slurm-3615.out
out.bp.hap1.p_ctg.lowQ.bed out.bp.hap2.p_ctg.noseq.gfa out.bp.p_utg.gfa out.bp.r_utg.lowQ.bed out.hap2.p_ctg.fa stats.txt
out.bp.hap1.p_ctg.noseq.gfa out.bp.p_ctg.gfa out.bp.p_utg.lowQ.bed out.bp.r_utg.noseq.gfa out.hap2.p_ctg.fa.sort.fasta
kosukesano@at139:~/tools/for_softmask/nama_data$データベース作成
kosukesano@at137:~$ source ~/tools/pyenv_env/EDTA_profile
(EDTA2) kosukesano@at137:~$ cd tools/for_softmask/Ojiro_softmask/
(EDTA2) kosukesano@at137:~/tools/for_softmask/Ojiro_softmask$ ls
(EDTA2) kosukesano@at137:~/tools/for_softmask/Ojiro_softmask$ BuildDatabase -name Ojiro_BLAST_DATABASE ../nama_data/Release_241005-ojiro_hifiasm/out.p_ctg.fa
Building database Ojiro_BLAST_DATABASE:
Reading ../nama_data/Release_241005-ojiro_hifiasm/out.p_ctg.fa...
Number of sequences (bp) added to database: 328 ( 736756452 bp )
(EDTA2) kosukesano@at137:~/tools/for_softmask/Ojiro_softmask$ ls
Ojiro_BLAST_DATABASE.nhr Ojiro_BLAST_DATABASE.njs Ojiro_BLAST_DATABASE.nni Ojiro_BLAST_DATABASE.nsq
Ojiro_BLAST_DATABASE.nin Ojiro_BLAST_DATABASE.nnd Ojiro_BLAST_DATABASE.nog Ojiro_BLAST_DATABASE.translation
(EDTA2) kosukesano@at137:~/tools/for_softmask/Ojiro_softmask$ ASTRAL系統樹を用いたbranchモデルのcodeml
b_free: #1を振った特定の枝でdN/dSが異なるというモデルb_neut: 全ての枝でdN/dSが1であるというモデルM0: 全ての枝でdN/dSが一定であるというモデル
この3つのモデルを以下の方法で比較する。
b_freevsb_neut: #1を振った特定の枝でdN/dSが1と異なるか。 BackgroundはdN/dS=1で、ForegroundはdN/dS≠1。b_freevsM0: #1を振った特定の枝でdN/dSが他と異なるか。Backgroundは0\<dN/dS\<1で、ForegroundはdN/dS≠1。
このうち、branch-siteでは見れないdN/dS<1が見れると良いな
b_free
~/tools/for_paml/241009_ASTRAL_6sp/b_freeディレクトリを作成。その下で以下の2つのスクリプトを書いた。
(EDTA2) kosukesano@at137:~/tools/for_paml/241009_ASTRAL_6sp/b_free$ ls
b_free_ASTRAL_paml.sh template.ctl
(EDTA2) kosukesano@at137:~/tools/for_paml/241009_ASTRAL_6sp/b_free$ ### b_free_ASTRAL_paml.sh
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/data/CDS_SCO"
bsA_dir="/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/b_free"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/template.ctl"
# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"
# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")
# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fna; do
if [[ -f "$file" ]]; then
base_name=$(basename "$file" .fna)
outfile_path="$result_dir/${base_name}_b_free"
# 一時的な制御ファイルの内容を生成
ctl_content="${ctl_template//<SEQFILE>/$file}"
ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"
# 一時的な制御ファイルを作成
ctl_path="$bsA_dir/bsA.ctl"
echo "$ctl_content" > "$ctl_path"
# PAMLを実行
singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"
echo "Processed file: $file, output: $outfile_path"
fi
done### template.ctl
seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/ASTRAL_6sp/data/new_tree_ASTRAL_ultrametric.nwk
outfile = <OUTFILE>
model = 2 * 記号の有無で異なる ω を推定
NSsites = 0 * サイト間では ω は一定
fix_omega = 0 * ω の値を配列から推定
omega = 1 * 推定は ω=1 からスタート
noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0これをqsubで投げた。
b_neut
### b_neut_ASTRAL_paml.sh
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/data/CDS_SCO"
bsA_dir="/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/b_neut"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/template.ctl"
# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"
# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")
# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fna; do
if [[ -f "$file" ]]; then
base_name=$(basename "$file" .fna)
outfile_path="$result_dir/${base_name}_b_neut"
# 一時的な制御ファイルの内容を生成
ctl_content="${ctl_template//<SEQFILE>/$file}"
ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"
# 一時的な制御ファイルを作成
ctl_path="$bsA_dir/bsA.ctl"
echo "$ctl_content" > "$ctl_path"
# PAMLを実行
singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"
echo "Processed file: $file, output: $outfile_path"
fi
done### template.ctl
seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/ASTRAL_6sp/data/new_tree_ASTRAL_ultrametric.nwk
outfile = <OUTFILE>
model = 2 * 記号の有無で異なる ω を推定
NSsites = 0 * サイト間では ω は一定
fix_omega = 1 * ω の値を固定
omega = 1 * 推定は ω=1 からスタート
noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0M0
### M0_ASTRAL_paml.sh
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/data/CDS_SCO"
bsA_dir="/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/M0"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/template.ctl"
# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"
# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")
# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fna; do
if [[ -f "$file" ]]; then
base_name=$(basename "$file" .fna)
outfile_path="$result_dir/${base_name}_m_zero"
# 一時的な制御ファイルの内容を生成
ctl_content="${ctl_template//<SEQFILE>/$file}"
ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"
# 一時的な制御ファイルを作成
ctl_path="$bsA_dir/bsA.ctl"
echo "$ctl_content" > "$ctl_path"
# PAMLを実行
singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"
echo "Processed file: $file, output: $outfile_path"
fi
done### template.sh
seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/ASTRAL_6sp/data/new_tree_ASTRAL_ultrametric.nwk
outfile = <OUTFILE>
model = 0
NSsites = 0 * サイト間では ω は一定
fix_omega = 0 * ω の値を配列から推定
omega = 1 * 推定は ω=1 からスタート
noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0IQTREE系統樹を用いたbranchモデルのcodeml
ASTRALのと同じことをした。
尤度比検定
### free_vs_neut_lrp.py
import os
import re
from scipy.stats import chi2
def parse_lnL(file_path):
try:
with open(file_path, 'r') as f:
for line in f:
print(f"Processing line: {line.strip()}") # デバッグ用
match = re.search(r'lnL\(ntime:\s*\d+\s+np:\s*(\d+)\):\s+(-?\d+\.\d+)', line)
if match:
np = int(match.group(1))
lnL = float(match.group(2))
return np, lnL
print(f"{file_path} に 'lnL' 行が見つかりませんでした。形式を確認してください。")
return None, None
except Exception as e:
print(f"{file_path} を開く際にエラーが発生しました: {e}")
return None, None
def parse_w_ratios(file_path):
try:
with open(file_path, 'r') as f:
content = f.read()
match = re.search(r'Smad #(\d+\.\d+)', content)
if match:
smad_w_ratio = float(match.group(1))
return smad_w_ratio
print(f"{file_path} に 'Smad' の w ratio が見つかりませんでした。")
return None
except Exception as e:
print(f"{file_path} の w ratio 抽出中にエラーが発生しました: {e}")
return None
def perform_lrt(alt_lnL, alt_np, null_lnL, null_np):
try:
lr_stat = 2 * (alt_lnL - null_lnL)
df = alt_np - null_np
p_val = chi2.sf(lr_stat, df)
return p_val
except Exception as e:
print(f"LRT計算中にエラーが発生しました: {e}")
return None
def main():
alt_dir = '/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/b_free/result'
null_dir = '/home/kosukesano/tools/for_paml/241009_ASTRAL_6sp/b_neut/result'
output_file = 'ASTRAL_free_vs_neut_lrt_results_with_w_ratios.txt'
alt_dir = os.path.expanduser(alt_dir)
null_dir = os.path.expanduser(null_dir)
# 処理するOGファイルリストの取得
og_files = [f for f in os.listdir(alt_dir) if '_maffted_fixed_b_free' in f]
with open(output_file, 'w') as out_f:
out_f.write('OG_num\tp_val\tpositive_selection\tSmad_w_ratio\n')
# 各OGファイルについてループ処理
for idx, og_file in enumerate(og_files):
og_num = og_file.split('_')[0]
alt_file = os.path.join(alt_dir, og_file)
null_file = os.path.join(null_dir, og_file.replace('_maffted_fixed_b_free', '_maffted_fixed_b_neut'))
print(f"{idx+1}/{len(og_files)}: {og_num} の解析を開始します...")
if os.path.exists(null_file):
alt_np, alt_lnL = parse_lnL(alt_file)
null_np, null_lnL = parse_lnL(null_file)
smad_w_ratio = parse_w_ratios(alt_file)
if alt_np is not None and null_np is not None:
p_val = perform_lrt(alt_lnL, alt_np, null_lnL, null_np)
if p_val is not None:
reject_null = '+' if p_val < 0.05 else '-'
out_f.write(f'{og_num}\t{p_val}\t{reject_null}\t{smad_w_ratio}\n')
print(f"{og_num} の解析が完了しました。p値: {p_val}, 正の選択: {reject_null}, Smadのw ratio: {smad_w_ratio}")
else:
print(f"{og_num} のLRT計算に失敗しました。")
else:
print(f"{og_num} のlnLデータが不完全です。")
else:
print(f"{og_num} の対応するnullモデルファイルが見つかりませんでした。")
if __name__ == "__main__":
main()これを実行した。
1017
遺伝研環境でのオジロのソフトマスク続き
RepeatModelerが終わったので、続いてRepeatMaskerをかける。
以下のスクリプトを書いてqsubで投げた。
### Ojiro_RepeatMasker.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date
source /home/kosukesano/tools/pyenv_env/EDTA_profile
RepeatMasker -pa 6 -lib\
/home/kosukesano/tools/for_softmask/Ojiro_softmask/RM_3181478.TueOct151949192024/consensi.fa.classified\
/home/kosukesano/tools/for_softmask/nama_data/Release_241005-ojiro_hifiasm/out.p_ctg.fa
datescorpion環境でのオジロのソフトマスク続き
遺伝研環境と同様、RepeatModelerが終わったので、RepeatMaskerをかける。
以下のスクリプトを書いてqsubで投げた。
### scorpionでのOjiro_RepeatMasker.shの中身
#$ -S /bin/bash
#$ -cwd
cd /home/dendezia/tool/for_softmask/Ojiro_softmask/
source /home/dendezia/tool/pyenv_env/EDTA_profile
RepeatMasker -pa 6 -lib\
/home/dendezia/tool/for_softmask/Ojiro_softmask/RM_1996100.TueOct150214432024/consensi.fa.classified\
-xsmall\
/home/dendezia/tool/for_softmask/nama_data/Release_241005-ojiro_hifiasm/out.p_ctg.fa
date遺伝研との変更点として、-xmallというオプションをつけてみた。Metalさんのサイトだと「反復配列を小文字にするsoft mask。 デフォルトでは N に置き換えるhard mask。」と説明されている。これを使えばわざわざProcessRepeatsのスクリプトを書かなくても良くなる?
これについて、遺伝研のがqwの間にオジロのRepeatMaskerが終わってた。 しかもソフトマスクまで終了している!
dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ ls ../nama_data/
Ekam_dataset Release_241005-ojiro_hifiasm Release_241005-ojiro_hifiasm.tar.gz
dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$ ls ../nama_data/Release_241005-ojiro_hifiasm
hifiasm.sh out.bp.hap2.p_ctg.gfa out.bp.p_ctg.lowQ.bed out.bp.p_utg.noseq.gfa out.hap1.p_ctg.fa out.p_ctg.fa out.p_ctg.fa.tbl
out.bp.hap1.p_ctg.gfa out.bp.hap2.p_ctg.lowQ.bed out.bp.p_ctg.noseq.gfa out.bp.r_utg.gfa out.hap1.p_ctg.fa.sort.fasta out.p_ctg.fa.cat.gz slurm-3615.out
out.bp.hap1.p_ctg.lowQ.bed out.bp.hap2.p_ctg.noseq.gfa out.bp.p_utg.gfa out.bp.r_utg.lowQ.bed out.hap2.p_ctg.fa out.p_ctg.fa.masked stats.txt
out.bp.hap1.p_ctg.noseq.gfa out.bp.p_ctg.gfa out.bp.p_utg.lowQ.bed out.bp.r_utg.noseq.gfa out.hap2.p_ctg.fa.sort.fasta out.p_ctg.fa.out
dendezia@scorpion:~/tool/for_softmask/Ojiro_softmask$RepeatMaskerの直接の出力であるout.p_ctg.fa.cat.gzの他に、ProcessRepeatsの出力であるout.p_ctg.fa.maskedもいる!
### out.p_ctg.fa.maskedの中身の一部
>ptg000001l
TAGCAGTATCGAGTATAATCATAATATCGTAGTTTTATTGCTAAAACTGT
CCTTTCAACTAATAGTTAGGTATAGATATTCACATATGCATTTTCATTTT
TAAATAAATCTTCGATACTCTGTAATCAATTTCCATTTTTGTTCTATCCC
AAATTATATAAAGTATATAATTTTCTATGTTTTTTTGGTGGAGTGTTCGC
AAAGGGCTGTGACTTGAAGGATGCGTCTTAATCTCGAGGAATATAATGAA
GCAAATGTATCTGCATTAATCTTCTTCTATCTAGTGAGTTGAAATATAAT
GTGGGGTATTATAACAATGACGCAGTAGTAAATAAAAATAAATCAAATCG
ACTTACGTCGATATAAAGTATACTAATTAAAAACATAAAGTCAATCTCGC
AAAAGCAAATATAAGTTAATACATATTAGATATAAATTTGTCCAGATATA
TTAAAATGGCTATTAGTCATTTCTTGACACGGGAtaattaataattaatt
tttcattaaattaaCATACTAAGAAAAACCAGACATCAGACCCAGTTGGT
TTTTCAACTGAAGTGAAACAGTAATCTTAAGCAAATATATCAATAATCTA
ATATGAATTCCTACAAAATTATCTGCTTGAACCTAGAACAAGCTATGCCT
GCGTATATAACTTTAACCAGTTAAGTGACTTCATGCATATATTACTATGA
TTTTAACACCTAATTAGCCTAATGGCTTCTGCTTATGTTCAAAAGATTAC
ATCTAAGTCGATTTTCTTCTCATCGTCATAAGAGGATTAAAATATTCAAA
TTAATAATATCCAGAATGATCAATAAATTAACAAACGAAATTTTAAATTG
CCGTTGATCTAATGTGGTAAATGGGTATTATGTAATATTTTTCGACAGGG
GTGGTATGATCGAGTAATTCGTCAACTAGAAACTACAGTATATATTGTAT
CTGAGCTGAACGAAGTtacagggatatccatataaaagtaatgaatccta
ctttttattcttaaataaacgttatatataaaagttttggctattttgaa
acatttatatatcttacaaccaaaataattgtgcaacgaataatgaagta
gtataaaacatcgattttcttgcttacttaaattggacggtatggttttt
attgcatatttgtattctaGGAAAAATTATGAACGTTACATGTATTATGGちゃんとソフトマスクされてるっぽい。今後はこれでいいな。
オジロゲノムのBRAKER
マスキング後のオジロのゲノムは241017_Ojiro_masked.faとした。
まずオジロのゲノムをローカルに移動
:~/bio$ scp dendezia@scorpion:/home/dendezia/tool/for_softmask/nama_data/Release_241005-ojiro_hifiasm/out.p_ctg.fa.masked ~/bio/241017_Ojiro_masked.fa
Host key fingerprint is SHA256:KPa37JYErRVG/1YWy31gMOwAs13hHzUeg3opGD75qVY
+--[ED25519 256]--+
| .+. .=o=+.|
| o*.o.=.*+|
| oo.*oo B.o|
| ..o= +.* ..|
| o .+S o * . |
| . o. . E |
| ....o |
| oo+ |
| o= |
+----[SHA256]-----+
out.p_ctg.fa.masked 100% 717MB 111.5MB/s 00:06
:~/bio$ ls
240903_ASTRAL.tre DEG_Adult_vs_Larva_DESeq2.csv fastp.json
240903_ASTRAL_Optimal_tree.tre DEG_ovary_vs_body_DESeq2.csv femo_annotated
240910_ASTRAL.tre GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz for_blast_test
240910_ASTRAL_Optimal_tree.tre IQTREE_7sp.tre for_cafe
240912_ASTRAL.tre SRR11742112 for_eggnoc
240912_ASTRAL_Optimal_tree.tre SRR11742112_1.fastq for_paml
240917_6sp_withOneZero_ASTRAL.tre SRR11742112_2.fastq functional_annotation
240917_7sp_ASTRAL.tre SRR9665770 madara_annotated
240917_CO1.tre SRR9665770_1.fastq madara_braker.zip
240919_7sp SRR9665770_2.fastq memo.txt
241017_Ojiro_masked.fa SRR9665770_report1.html new_rbh.py
7sp.tre braker_t1_sequences.aa.zip qc_SRR9665770_1.fq
CAFE前準備.R drawtree.R qc_SRR9665770_2.fq
:~/bio$ 続いてこれを遺伝研環境に移動
:~/bio$ scp ~/bio/241017_Ojiro_masked.fa kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_braker/nama_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
241017_Ojiro_masked.fa 100% 717MB 108.7MB/s 00:06
:~/bio$ また、オジロはRNA-seqのデータもあるので、それらをすべて遺伝研に転送する。遺伝研の方でkosukesano/tools/for_braker/nama_data/Ojiro_RNAseqディレクトリを作り、そこにすべて転送した。
:/Volumes/Elements_1/240529_RNAseq/RawData$ scp /Volumes/Elements_1/240529_RNAseq/RawData/ojiro_*/*.gz kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_braker/nama_data/Ojiro_RNAseq
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
ojiro_E1_1.fq.gz 100% 2176MB 107.7MB/s 00:20
ojiro_E1_2.fq.gz 100% 2254MB 106.0MB/s 00:21
ojiro_E2_1.fq.gz 100% 2379MB 110.1MB/s 00:21
ojiro_E2_2.fq.gz 100% 2464MB 109.7MB/s 00:22
ojiro_E3_1.fq.gz 100% 1858MB 107.4MB/s 00:17
ojiro_E3_2.fq.gz 100% 1892MB 95.7MB/s 00:19
ojiro_E4_1.fq.gz 100% 2184MB 88.2MB/s 00:24
ojiro_E4_2.fq.gz 100% 2213MB 91.0MB/s 00:24
ojiro_H1_1.fq.gz 100% 1510MB 87.7MB/s 00:17
ojiro_H1_2.fq.gz 100% 1558MB 108.8MB/s 00:14
ojiro_H2_1.fq.gz 100% 1860MB 110.6MB/s 00:16
ojiro_H2_2.fq.gz 100% 1909MB 108.5MB/s 00:17
ojiro_H3_1.fq.gz 100% 1620MB 107.8MB/s 00:15
ojiro_H3_2.fq.gz 100% 1669MB 109.6MB/s 00:15
ojiro_H4_1.fq.gz 100% 1846MB 108.7MB/s 00:16
ojiro_H4_2.fq.gz 100% 1889MB 108.6MB/s 00:17
ojiro_L1_1.fq.gz 100% 2072MB 110.0MB/s 00:18
ojiro_L1_2.fq.gz 100% 2149MB 109.0MB/s 00:19
ojiro_L2_1.fq.gz 100% 2166MB 108.3MB/s 00:19
ojiro_L2_2.fq.gz 100% 2200MB 96.0MB/s 00:22
ojiro_L3_1.fq.gz 100% 1838MB 90.3MB/s 00:20
ojiro_L3_2.fq.gz 100% 1894MB 95.4MB/s 00:19
ojiro_L4_1.fq.gz 100% 1868MB 107.0MB/s 00:17
ojiro_L4_2.fq.gz 100% 1939MB 95.4MB/s 00:20
ojiro_O1_1.fq.gz 100% 1603MB 93.4MB/s 00:17
ojiro_O1_2.fq.gz 100% 1682MB 110.1MB/s 00:15
ojiro_O2_1.fq.gz 100% 1707MB 108.7MB/s 00:15
ojiro_O2_2.fq.gz 100% 1783MB 109.0MB/s 00:16
ojiro_O3_1.fq.gz 100% 1499MB 103.8MB/s 00:14
ojiro_O3_2.fq.gz 100% 1546MB 105.3MB/s 00:14
ojiro_O4_1.fq.gz 100% 1865MB 109.5MB/s 00:17
ojiro_O4_2.fq.gz 100% 1921MB 106.7MB/s 00:18
ojiro_T1_1.fq.gz 100% 1891MB 109.8MB/s 00:17
ojiro_T1_2.fq.gz 100% 1986MB 108.9MB/s 00:18
ojiro_T2_1.fq.gz 100% 1550MB 106.6MB/s 00:14
ojiro_T2_2.fq.gz 100% 1618MB 107.8MB/s 00:15
ojiro_T3_1.fq.gz 100% 1724MB 108.1MB/s 00:15
ojiro_T3_2.fq.gz 100% 1796MB 107.5MB/s 00:16
ojiro_T4_1.fq.gz 100% 1505MB 105.0MB/s 00:14
ojiro_T4_2.fq.gz 100% 1563MB 106.8MB/s 00:14
ojiro_male_1.fq.gz 100% 1408MB 108.1MB/s 00:13
ojiro_male_2.fq.gz 100% 1470MB 109.4MB/s 00:13
:/Volumes/Elements_1/240529_RNAseq/RawData$ cd ../first_raw_read/
:/Volumes/Elements_1/240529_RNAseq/first_raw_read$ ls
femo-female_1.fastq.gz femo-larva_1.fastq.gz femo-male_1.fastq.gz ojiro-female_1.fastq.gz ojiro-larva_1.fastq.gz ojiro-male_1.fastq.gz ojiro_femo.md5sum
femo-female_2.fastq.gz femo-larva_2.fastq.gz femo-male_2.fastq.gz ojiro-female_2.fastq.gz ojiro-larva_2.fastq.gz ojiro-male_2.fastq.gz
:/Volumes/Elements_1/240529_RNAseq/first_raw_read$ scp /Volumes/Elements_1/240529_RNAseq/first_raw_read/ojiro* kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_braker/nama_data/Ojiro_RNAseq
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
ojiro-female_1.fastq.gz 100% 1049MB 89.2MB/s 00:11
ojiro-female_2.fastq.gz 100% 1108MB 109.2MB/s 00:10
ojiro-larva_1.fastq.gz 100% 1310MB 109.9MB/s 00:11
ojiro-larva_2.fastq.gz 100% 1350MB 86.9MB/s 00:15
ojiro-male_1.fastq.gz 100% 1022MB 105.2MB/s 00:09
ojiro-male_2.fastq.gz 100% 1059MB 100.6MB/s 00:10
ojiro_femo.md5sum 100% 810 81.2KB/s 00:00
:/Volumes/Elements_1/240529_RNAseq/first_raw_read$ 遺伝研にて解凍。これめちゃくちゃ時間かかる……。
kosukesano@at138:~/tools/for_braker/nama_data/Ojiro_RNAseq$ ls
ojiro-female_1.fastq.gz ojiro-male_2.fastq.gz ojiro_E3_1.fq.gz ojiro_H1_2.fq.gz ojiro_H4_1.fq.gz ojiro_L2_2.fq.gz ojiro_O1_1.fq.gz ojiro_O3_2.fq.gz ojiro_T2_1.fq.gz ojiro_T4_2.fq.gz
ojiro-female_2.fastq.gz ojiro_E1_1.fq.gz ojiro_E3_2.fq.gz ojiro_H2_1.fq.gz ojiro_H4_2.fq.gz ojiro_L3_1.fq.gz ojiro_O1_2.fq.gz ojiro_O4_1.fq.gz ojiro_T2_2.fq.gz ojiro_femo.md5sum
ojiro-larva_1.fastq.gz ojiro_E1_2.fq.gz ojiro_E4_1.fq.gz ojiro_H2_2.fq.gz ojiro_L1_1.fq.gz ojiro_L3_2.fq.gz ojiro_O2_1.fq.gz ojiro_O4_2.fq.gz ojiro_T3_1.fq.gz ojiro_male_1.fq.gz
ojiro-larva_2.fastq.gz ojiro_E2_1.fq.gz ojiro_E4_2.fq.gz ojiro_H3_1.fq.gz ojiro_L1_2.fq.gz ojiro_L4_1.fq.gz ojiro_O2_2.fq.gz ojiro_T1_1.fq.gz ojiro_T3_2.fq.gz ojiro_male_2.fq.gz
ojiro-male_1.fastq.gz ojiro_E2_2.fq.gz ojiro_H1_1.fq.gz ojiro_H3_2.fq.gz ojiro_L2_1.fq.gz ojiro_L4_2.fq.gz ojiro_O3_1.fq.gz ojiro_T1_2.fq.gz ojiro_T4_1.fq.gz
kosukesano@at138:~/tools/for_braker/nama_data/Ojiro_RNAseq$ rm ojiro_femo.md5sum
kosukesano@at138:~/tools/for_braker/nama_data/Ojiro_RNAseq$ ls
ojiro-female_1.fastq.gz ojiro-male_2.fastq.gz ojiro_E3_1.fq.gz ojiro_H1_2.fq.gz ojiro_H4_1.fq.gz ojiro_L2_2.fq.gz ojiro_O1_1.fq.gz ojiro_O3_2.fq.gz ojiro_T2_1.fq.gz ojiro_T4_2.fq.gz
ojiro-female_2.fastq.gz ojiro_E1_1.fq.gz ojiro_E3_2.fq.gz ojiro_H2_1.fq.gz ojiro_H4_2.fq.gz ojiro_L3_1.fq.gz ojiro_O1_2.fq.gz ojiro_O4_1.fq.gz ojiro_T2_2.fq.gz ojiro_male_1.fq.gz
ojiro-larva_1.fastq.gz ojiro_E1_2.fq.gz ojiro_E4_1.fq.gz ojiro_H2_2.fq.gz ojiro_L1_1.fq.gz ojiro_L3_2.fq.gz ojiro_O2_1.fq.gz ojiro_O4_2.fq.gz ojiro_T3_1.fq.gz ojiro_male_2.fq.gz
ojiro-larva_2.fastq.gz ojiro_E2_1.fq.gz ojiro_E4_2.fq.gz ojiro_H3_1.fq.gz ojiro_L1_2.fq.gz ojiro_L4_1.fq.gz ojiro_O2_2.fq.gz ojiro_T1_1.fq.gz ojiro_T3_2.fq.gz
ojiro-male_1.fastq.gz ojiro_E2_2.fq.gz ojiro_H1_1.fq.gz ojiro_H3_2.fq.gz ojiro_L2_1.fq.gz ojiro_L4_2.fq.gz ojiro_O3_1.fq.gz ojiro_T1_2.fq.gz ojiro_T4_1.fq.gz
kosukesano@at138:~/tools/for_braker/nama_data/Ojiro_RNAseq$ gunzip -rf ../Ojiro_RNAseq/遺伝研にて、オジロ用のBRAKERディレクトリを作った。
kosukesano@at138:~/tools/for_braker$ mkdir Ojiro
kosukesano@at138:~/tools/for_braker$ cd Ojiro
kosukesano@at138:~/tools/for_braker/Ojiro$ nano Ojiro_braker.sh1018
オジロの BRAKER続き
~/tools/for_braker/Ojiroで以下のスクリプトを書き、qsubで投げた。
### Ojiro_braker.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/241017_Ojiro_masked.fa\
--prot_seq=/home/kosukesano/tools/Arthropoda.fa\
--rnaseq_sets_ids=ojiro-female_1,ojiro-female_2,ojiro-male_1,ojiro-male_2,ojiro-larva_1,ojiro-larva_2,\
ojiro_E1_1,ojiro_E1_2,ojiro_E2_1,ojiro_E2_2,ojiro_E3_1,ojiro_E3_2,ojiro_E4_1,ojiro_E4_2,\
ojiro_H1_1,ojiro_H1_2,ojiro_H2_1,ojiro_H2_2,ojiro_H3_1,ojiro_H3_2,ojiro_H4_1,ojiro_H4_2,\
ojiro_L1_1,ojiro_L1_2,ojiro_L2_1,ojiro_L2_2,ojiro_L3_1,ojiro_L3_2,ojiro_L4_1,ojiro_L4_2,\
ojiro_O1_1,ojiro_O1_2,ojiro_O2_1,ojiro_O2_2,ojiro_O3_1,ojiro_O3_2,ojiro_O4_1,ojiro_O4_2,\
ojiro_T1_1,ojiro_T1_2,ojiro_T2_1,ojiro_T2_2,ojiro_T3_1,ojiro_T3_2,ojiro_T4_1,ojiro_T4_2\
--rnaseq_sets_dir=/home/kosukesano/tools/for_braker/nama_data/Ojiro_RNAseq\
--threads=16\
--species=Ojiro_241017\
--AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
--AUGUSTUS_BIN_PATH=/usr/bin\
--AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
--TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
date
~また、intelノードが混んでそうだったので、gpuノード用のディレクトリ~/tools/for_braker/Ojiro/gputestも作成、以下のスクリプトを書いた。
### GPU_Ojiro_braker.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/241017_Ojiro_masked.fa\
--prot_seq=/home/kosukesano/tools/Arthropoda.fa\
--rnaseq_sets_ids=ojiro-female_1,ojiro-female_2,ojiro-male_1,ojiro-male_2,ojiro-larva_1,ojiro-larva_2,\
ojiro_E1_1,ojiro_E1_2,ojiro_E2_1,ojiro_E2_2,ojiro_E3_1,ojiro_E3_2,ojiro_E4_1,ojiro_E4_2,\
ojiro_H1_1,ojiro_H1_2,ojiro_H2_1,ojiro_H2_2,ojiro_H3_1,ojiro_H3_2,ojiro_H4_1,ojiro_H4_2,\
ojiro_L1_1,ojiro_L1_2,ojiro_L2_1,ojiro_L2_2,ojiro_L3_1,ojiro_L3_2,ojiro_L4_1,ojiro_L4_2,\
ojiro_O1_1,ojiro_O1_2,ojiro_O2_1,ojiro_O2_2,ojiro_O3_1,ojiro_O3_2,ojiro_O4_1,ojiro_O4_2,\
ojiro_T1_1,ojiro_T1_2,ojiro_T2_1,ojiro_T2_2,ojiro_T3_1,ojiro_T3_2,ojiro_T4_1,ojiro_T4_2\
--rnaseq_sets_dir=/home/kosukesano/tools/for_braker/nama_data/Ojiro_RNAseq\
--threads=16\
--species=Ojiro_241017_GPU\
--AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
--AUGUSTUS_BIN_PATH=/usr/bin\
--AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
--TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
datePstrのBRAKER
ジョブは終わったけど結果のファイルが出力されてないぞ!?
### Pstr_braker.sh.o27030241の中身
start at
Tue Oct 15 13:32:47 JST 2024
# Tue Oct 15 13:32:54 2024: Log information is stored in file /lustre7/home/kosukesano/tools/for_braker/Pstr/braker/braker.log
#*********
# WARNING: in file /home/kosukesano/tools/braker_git_install/BRAKER/scripts/braker.pl at line 1413
file /lustre7/home/kosukesano/tools/for_braker/Pstr/braker/genome.fa contains a highly fragmented assembly (84140 scaffolds). This may lead to problems when running AUGUSTUS via braker in parallelized mode. You set --threads=16. You should run braker.pl in linear mode on such genomes, though (--threads=1).
#*********なんかコフキの時に見たエラーな気がする。とりあえずスレッド数を落としてもう一度行った。
1019
オジロのBRAKER終了!
gpuの方で行ったオジロのBRAKERが終了してた。
kosukesano@at138:~/tools/for_braker/Ojiro$ cd gputest/braker/
kosukesano@at138:~/tools/for_braker/Ojiro/gputest/braker$ ls
Augustus GeneMark-ETP braker.aa braker.codingseq braker.gtf braker.log errors genome_header.map hintsfile.gff species what-to-cite.txtできたファイルはこんな感じ
kosukesano@at139:~/tools/for_braker/Ojiro/gputest/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file format type num_seqs sum_len min_len avg_len max_len
braker.aa FASTA Protein 16,019 9,595,507 4 599 19,746
kosukesano@at139:~/tools/for_braker/Ojiro/gputest/braker$ オジロを含めた7種でのOrthofinder
まずオジロのヘッダーを修正する。修正用にbraker.aaをOjiro.fastaとしてコピー。
kosukesano@at138:~/tools/for_braker/Ojiro/gputest/braker$ cp braker.aa Ojiro.fasta
kosukesano@at138:~/tools/for_braker/Ojiro/gputest/braker$ ls
Augustus GeneMark-ETP Ojiro.fasta braker.aa braker.codingseq braker.gtf braker.log errors genome_header.map hintsfile.gff species what-to-cite.txt続いて同じディレクトリでヘッダー書き換え用スクリプトedit.pyを作成。
### edit.pyの中身
### edit.pyの中身
import os
from Bio import SeqIO
# 入力ディレクトリと出力ディレクトリのパス
input_dir = '../braker'
output_dir = '../braker/RemakeHedder_Ojiro'
# 出力ディレクトリが存在しない場合は作成
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# 入力ディレクトリ内のすべての .fasta ファイルを処理
for input_file in os.listdir(input_dir):
if input_file.endswith('.fasta'):
input_path = os.path.join(input_dir, input_file)
output_path = os.path.join(output_dir, input_file)
# 入力ファイルを読み込み、条件に基づいて書き換えた内容を出力ファイルに保存
with open(output_path, 'w') as outfile:
for record in SeqIO.parse(input_path, 'fasta'):
header = record.description
seq = str(record.seq)
# ヘッダーが「g」で始まる場合
if header.startswith("g"):
# 新しいヘッダーは「>Ojir」 + 「元のヘッダーの番号」
number = header.split()[0] # ヘッダーの最初の番号部分を取得
new_header = f">Ojir_{number}"
# ヘッダーが「]」で終わる場合
elif header.endswith("]"):
# ヘッダーの最後の「[]」内の英字を抽出
within_brackets = header.split('[')[-1].split(']')[0]
first_letter = within_brackets[0] # 最初の1文字
space_after = within_brackets.split()[-1][:3] # スペース後の3文字
# 元のヘッダーから最初の「>」の次の文字から最初の「 」までの部分を取得
first_part = header.split()[0][1:]
new_header = f">{first_letter}{space_after}_{first_part}"
else:
new_header = f">{header.split()[0]}"
# 新しいヘッダーと配列を出力ファイルに書き込む
outfile.write(f"{new_header}\n{seq}\n")
print(f"{output_path} に保存しました。")これを実行。
続いて~/tools/for_orthofinderディレクトリにて241019_6plusOjiroディレクトリを作成。ここにオジロや他6種のゲノムを持ってきた。
kosukesano@at139:~/tools/for_orthofinder/241019_6plusOjiro$ cp ~/tools/for_braker/Ojiro/gputest/braker/RemakeHedder_Ojiro/Ojiro.fasta ../241019_6plusOjiro/
kosukesano@at139:~/tools/for_orthofinder/241019_6plusOjiro$ ls
Ojiro.fasta
kosukesano@at139:~/tools/for_orthofinder/241019_6plusOjiro$ cp ../RemakeHedder_6sp/*.fasta ../241019_6plusOjiro/
kosukesano@at139:~/tools/for_orthofinder/241019_6plusOjiro$ ls
Agra.fasta Cass.fasta Dpon.fasta Ojiro.fasta Smad.fasta Sory.fasta Tcas.fasta
kosukesano@at139:~/tools/for_orthofinder/241019_6plusOjiro$そして、~/tools/for_orthofinderディレクトリにてOrthofinder_241019.shを作成、qsubで投げた。
### Orthofinder_241019.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 16
#$ -l intel
echo start at
date
singularity exec /usr/local/biotools/o/orthofinder:2.5.4--hdfd78af_0 orthofinder\
-f /home/kosukesano/tools/for_orthofinder/241019_6plusOjiro\
-t 16
dateノードをintelに、スロットを16に設定。10分くらいで終わった。
出力はこう
kosukesano@at139:~/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19$ ls
Citation.txt Gene_Trees Orthogroups Phylogenetically_Misplaced_Genes Single_Copy_Orthologue_Sequences
Comparative_Genomics_Statistics Log.txt Orthologues Putative_Xenologs Species_Tree
Gene_Duplication_Events Orthogroup_Sequences Phylogenetic_Hierarchical_Orthogroups Resolved_Gene_Trees WorkingDirectory
kosukesano@at139:~/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19$ ls Orthogroups/
Orthogroups.GeneCount.tsv Orthogroups.tsv Orthogroups.txt Orthogroups_SingleCopyOrthologues.txt Orthogroups_UnassignedGenes.tsvオジロゲノムのBUSCO
BRAKER終わったらやっとかなきゃね。
~/tools/for_braker/Ojiro/gputest/braker/RemakeHedder_OjiroディレクトリでOjiro_busco.shを作成、qsubで投げた。
### Ojiro_busco.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
echo start at
date
singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco\
-m protein\
-i /home/kosukesano/tools/for_braker/Ojiro/gputest/braker/RemakeHedder_Ojiro/Ojiro.fasta\
-o /home/kosukesano/tools/for_braker/Ojiro/gputest/braker/BUSCO_OUTPUT_Ojiro\
-l\
/home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/\
-f
dateオジロゲノムを含めたPAML前準備、SCOの抽出
~/tools/for_paml/data/plusOjiroを作成し、ExOG.pyを書いた。
# ファイルパスの設定
orthogroups_file_path = '/home/kosukesano/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/Orthogroups/Orthogroups.txt'
single_copy_orthologues_file_path = '/home/kosukesano/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/Orthogroups/Orthogroups_SingleCopyOrthologues.txt'
output_file_path = '/home/kosukesano/tools/for_paml/data/plusOjiro/extracted_orthogroups.txt'
# シングルコピーオルソログのIDをセットに格納
single_copy_orthologues = set()
with open(single_copy_orthologues_file_path, 'r') as single_copy_file:
for line in single_copy_file:
single_copy_orthologues.add(line.strip())
# Orthogroups.txt から該当する行を抽出して新しいファイルに保存
with open(orthogroups_file_path, 'r') as orthogroups_file, open(output_file_path, 'w') as output_file:
for line in orthogroups_file:
# 行の最初の部分を取り出してIDをチェック
og_id = line.split(':')[0].strip()
if og_id in single_copy_orthologues:
output_file.write(line)これを実行するとextracted_orthogroups.txtができる
### extracted_orthogroups.txt
OG0008141: Agra_P_050292700.1 Cass_AG9761214.1 Dpon_P_019755574.2 Ojir_g1996.t1 Smad_g6358.t1 Sory_P_030761209.1 Tcas_P_008195282.1
OG0008142: Agra_P_050292731.1 Cass_AH1135743.1 Dpon_P_048519923.1 Ojir_g7978.t1 Smad_g2098.t1 Sory_P_030765758.1 Tcas_P_008196870.1
OG0008143: Agra_P_050292732.1 Cass_AG9767756.1 Dpon_P_019773495.1 Ojir_g6189.t1 Smad_g5269.t1 Sory_P_030765067.1 Tcas_P_015836383.1
OG0008144: Agra_P_050292739.1 Cass_AG9768060.1 Dpon_P_019769194.2 Ojir_g6137.t1 Smad_g11904.t1 Sory_P_030755089.1 Tcas_P_969265.1
OG0008145: Agra_P_050292743.1 Cass_AG9767942.1 Dpon_P_019767966.1 Ojir_g4737.t1 Smad_g4980.t1 Sory_P_030750408.1 Tcas_P_971491.1続いて、ヘッダーがタンパク質のものと揃っているアミノ酸CDSのファイルを揃える。~/tools/for_paml/data/plusOjiro_nama_data/Amino_seqディレクトリを作成、かつて作った分のファイルをコピー。
kosukesano@at138:~/tools/for_paml/data/plusOjiro_nama_data$ mkdir Amino_seq
kosukesano@at138:~/tools/for_paml/data/plusOjiro_nama_data$ cd Amino_seq/
kosukesano@at138:~/tools/for_paml/data/plusOjiro_nama_data/Amino_seq$ ls
kosukesano@at138:~/tools/for_paml/data/plusOjiro_nama_data/Amino_seq$ cp ../../241009_RemakeHedder_6sp_afterchange/.fasta ../Amino_seq/
cp: cannot stat '../../241009_RemakeHedder_6sp_afterchange/.fasta': No such file or directory
kosukesano@at138:~/tools/for_paml/data/plusOjiro_nama_data/Amino_seq$ cp ../../241009_RemakeHedder_6sp_afterchange/*.fasta ../Amino_seq/
kosukesano@at138:~/tools/for_paml/data/plusOjiro_nama_data/Amino_seq$ ls
Agra_changehedder.fasta Cass_changehedder.fasta Dpon_changehedder.fasta Smad_changehedder.fasta Sory_changehedder.fasta Tcas_changehedder.fastaまた、マダラもbraker.cordingseqをedit.pyで処置してコピー。
kosukesano@at138:~/tools/for_paml/data/plusOjiro_nama_data/Amino_seq$ ls
Agra_changehedder.fasta Cass_changehedder.fasta Dpon_changehedder.fasta Ojir_changehedder.fasta Smad_changehedder.fasta Sory_changehedder.fasta Tcas_changehedder.fasta
kosukesano@at138:~/tools/for_paml/data/plusOjiro_nama_data/Amino_seq$ ~/tools/for_paml/data/plusOjiroディレクトリに集約させた方が良いのでは?mvで移動させた。
kosukesano@at138:~/tools/for_paml/data/plusOjiro$ mv ../plusOjiro_nama_data/ ../plusOjiro
kosukesano@at138:~/tools/for_paml/data/plusOjiro$ ls
ExOG.py extracted_orthogroups.txt plusOjiro_nama_dataこの後、new_makefna.pyを作成、実行した。
# 必要なモジュールをインポート
import os
# ファイルパスの設定
orthogroups_file = "extracted_orthogroups.txt"
input_dir = "/home/kosukesano/tools/for_paml/data/plusOjiro/plusOjiro_nama_data/Amino_seq"
output_dir = "/home/kosukesano/tools/for_paml/data/plusOjiro/CDS_SCO/"
# ディレクトリが存在しない場合、作成
os.makedirs(output_dir, exist_ok=True)
# OG番号と遺伝子IDをextracted_orthogroups.txtから取得
with open(orthogroups_file, "r") as ortho_f:
for line in ortho_f:
if line.strip(): # 空行を無視
# 行をOG番号と遺伝子IDに分割
og_number, gene_ids_str = line.split(":")
og_number = og_number.strip()
gene_ids = gene_ids_str.strip().split()
# 遺伝子IDを種ごとに分割
genes = {
"Agra": gene_ids[0],
"Cass": gene_ids[1],
"Dpon": gene_ids[2],
"Ojir": gene_ids[3],
"Smad": gene_ids[4],
"Sory": gene_ids[5],
"Tcas": gene_ids[6]
}
# 出力ファイルのパスを設定
output_file = os.path.join(output_dir, f"{og_number}.fna")
# 出力ファイルを開く
with open(output_file, "w") as out_f:
# 各種ごとに遺伝子IDを取得し、対応するファイルからシーケンスを検索
for species, gene_id in genes.items():
fasta_file = os.path.join(input_dir, f"{species}_changehedder.fasta")
with open(fasta_file, "r") as fasta_f:
write_flag = False
for line in fasta_f:
if line.startswith(f">{gene_id}"):
# ヘッダー行を見つけたら、出力ファイルに書き込みを開始
out_f.write(line)
print(line.strip()) # 標準出力にヘッダーを表示
write_flag = True
elif line.startswith(">") and write_flag:
# 次のヘッダー行が見つかったら、現在の遺伝子の書き込みを終了
write_flag = False
elif write_flag:
# シーケンス部分を書き込む
out_f.write(line)
print(line.strip()) # 標準出力にシーケンスを表示
print(f"{og_number}.fna ファイルが {output_dir} に保存されました。")出力
>Tcas_P_008199734.2
ATGGAAATCGAGAACAAATTAGACGAGGACTTCGTCTTCTACCTCGGCTTCGTCGGTACTTACTTCAAACATATCCGCGATAAAGACATTCGTCACCACTGCGAACAATGGTTGCTAAAACTCTGCGGGGAGCCTTGCCAAGGAATTGAAAAGAAACGAGGCCGCAATATCTACCTCTCACAACTCATTCTATGCATGCAAACTGGAATTTTGGGCAATGAATTTAAAGTTCCGGTCAACGAAGTCGATGTAGCGAATGCGACCCAGGTGTTCCAGCTGCAGCCCGAAGGAGAAGCATTTCAGACCCCAGGATGGTTGGAGGATAACGATGCTGATGTAGGTACTGCTGCCAGGAATGCAAAAACTGGGCGGACTTACGTGGCTACGCGTACATTGCCGGGAGGACAAGGGGCTTTTGCGTACGTTGCCGTCTCCTTAGACGAGGAAGAACCCAAGTGGTTGGGAGGCGGGGAAGGTGTTTTTGACCGGCATATGGAACAGAAGTTCAGGGAGGAGGTGCCCGATTATGAAATGGAAAAGATTCTAGCAAGGAGGAAAGATCCTAAAGAGCGGGAGAAGGTTATCACCTTCTATAAAGTCCTGTTGACAAATATTGAAGATGAGTTGGACGAGAAGATACATGCAGGTGAAAATGACACTGTTAATGGTCTTTTGGAGCAACTGGAACAAGATATGAGGGATCGTGGCCAGTTTGAACCATTCGCACACTTGAATGCTAAAGATTTAAGAAACGAACTTCTTCTAGTGCTACACGATCGCATTCAGCTAAGGATTAATAAAGTGATGAAACGTGAGGAACTTCTCGATGAAATTGAGAAAGGCATTCTTGCGAAATCATTCTTTGAAACCTCTGTAACGCCAGAAGACAAGTTCTTGTTACCCCCGGCAATGTGGGAGCAAGCTATCAATAAAATCCCCAACAAAAAACTGCTTGAGAAATTAAGGGATAACTATCCGATGATTCTAATAGAGAAATTCTTGAAGCTACTTTCTGATTATAAAGAAGAAATAGCTGTGAGAATGCACCGTCGACATGAAAACATCGCCGCGCAGATGAAGCGGGAGTTAAGACGTGAAGACGAGAAGGGGAAGAAGCTTGTCGAAGGTGCCCAAATCGCCTGCGACCACGCTACTGAGATTCTCAAGGCTGTCAAAGAAGCCTATACCACTAAGGCCGAAGTCGAGAGGAGAAATGCAGAGAAGGTTGCCATTCCAAAATCGGAGCATTCTGAGCTTTATGATCAAATGAGGGCCGCTTTGCTTGACACTCAGAAGTCTGTCGAGGATGAAGCGGCCAGAGGAAAAGTGTTGGCTGCTCAAATTGGAGAAATCAACGAACAGACTGAAATGTGTTTGAAAGTAACGGAAGAAAATGTCAGGAAGATTGAAGAGAAGAATATGGAAATAATGAAAAATATCAAGAGACTGAATGCCGCAATTGACAATCAGCAGAAGAGGATCGAAATGGTGCAGAAGGTGGGGGCGAAGAAGGGAAATCAGCTTGAATTTTTCTTTTAA
OG0009886.fna ファイルが /home/kosukesano/tools/for_paml/data/plusOjiro/CDS_SCO/ に保存されました。
kosukesano@at138:~/tools/for_paml/data/plusOjiro$ ls
CDS_SCO ExOG.py extracted_orthogroups.txt new_makefna.py plusOjiro_nama_data
kosukesano@at138:~/tools/for_paml/data/plusOjiro$ ls CDS_SCO/
OG0008141.fna OG0008289.fna OG0008453.fna OG0008599.fna OG0008749.fna OG0008891.fna OG0009031.fna OG0009169.fna OG0009312.fna OG0009459.fna OG0009598.fna OG0009748.fna
OG0008142.fna OG0008290.fna OG0008454.fna OG0008600.fna OG0008750.fna OG0008892.fna OG0009032.fna OG0009171.fna OG0009314.fna OG0009460.fna OG0009599.fna OG0009749.fna
OG0008143.fna OG0008292.fna OG0008457.fna OG0008601.fna OG0008752.fna OG0008893.fna OG0009033.fna OG0009172.fna OG0009315.fna OG0009462.fna OG0009601.fna OG0009751.fnaこれにMAFFTをかける。~/tools/for_paml/data/plusOjiroディレクトリでmafft.shを作成、qsubで実行。
#$ -S /bin/bash
source ~/tools/pyenv_env/ManualPhilo_profile
# ディレクトリパス
input_dir="/home/kosukesano/tools/for_paml/data/plusOjiro/CDS_SCO/"
output_dir="/home/kosukesano/tools/for_paml/data/plusOjiro/CDS_SCO/"
# 各ファイルに対してアラインメントを実行
for file in "$input_dir"*.fna; do
# 元のファイル名から拡張子を除いたものを取得
base_name=$(basename "$file" .fna)
# 出力ファイル名を生成
output_file="${output_dir}${base_name}_maffted.fna"
# MAFFTを実行
mafft --auto --maxiterate 1000 --localpair "$file" > "$output_file"
echo "Aligned file created: $output_file"
doneオジロゲノムを含めたCAFE前準備
Orthogroups.GeneCount.tsvとSpeciesTree_rooted.txtをローカルにコピー。
:~/bio/for_cafe/241019_orthofinder_data$ scp kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/Orthogroups/Orthogroups.GeneCount.tsv /Users/kosukesano/bio/for_cafe/241019_orthofinder_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
Orthogroups.GeneCount.tsv 100% 387KB 4.1MB/s 00:00
:~/bio/for_cafe/241019_orthofinder_data$ scp kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/Species_Tree/SpeciesTree_rooted.txt /Users/kosukesano/bio/for_cafe/241019_orthofinder_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
SpeciesTree_rooted.txt 100% 189 10.2KB/s 00:00
:~/bio/for_cafe/241019_orthofinder_data$ ls
Orthogroups.GeneCount.tsv SpeciesTree_rooted.txt
:~/bio/for_cafe/241019_orthofinder_data$ ~/bio/for_cafe/にて241019_cafe前処理.Rを作成、実行した。
Orthologs_raw <- read_tsv(paste("/Users/kosukesano/bio/for_cafe/241019_orthofinder_data/Orthogroups.GeneCount.tsv", sep = "/"))
##Enzanはorthogroupのなかで遺伝子数が変なやつを検出するためのmatrix
Enzan <- Orthologs_raw %>%
select(!c(Orthogroup, Total)) %>%
t()
##saidai, saisyouは各Orthogroupの中で、各種が持っているコピー数の最大値及び最小値を記したdf
saidai <- Enzan %>%
apply(2, max) %>%
as.data.frame() %>%
rename(max_real = ".")
saisyou <- Enzan %>%
apply(2, min) %>%
as.data.frame() %>%
rename(min_real = ".")
##Orthologs_1は各Orthogroupsの最大値、最小値もくっつけたdf
Orthologs_1 <- Orthologs_raw %>% select(!c(Total)) %>%
bind_cols(saidai, saisyou)
##最大値と最小値の差
Orthologs_2 <-Orthologs_1 %>%
mutate(sa = max_real - min_real) %>%
filter(max_real != min_real) %>%
filter(sa < 50)
##外れ値と遺伝子ファミリー数が全種で共通の行を省いた。最後に1列目を複製し列名をいじって、CAFEへのインプットデータの出来上がり。
Orthologs_3 <- Orthologs_2 %>%
mutate(Description = Orthogroup, ID = Orthogroup) %>%
relocate(Description, ID) %>%
select(!c(Orthogroup, max_real, min_real, sa))
#Orthologs_3 %>%
# write_tsv(paste("/Users/kosukesano/bio/for_cafe/241019_orthofinder_data/Orthogroups.GeneCount2.tsv", sep = "/"))#, quote = FALSE) #,row.names = FALSE)
##Did you finish creating ultrametric tree with makeultrametric.R?
############
tree = read.tree("/Users/kosukesano/bio/for_cafe/241019_orthofinder_data/SpeciesTree_rooted.txt")
mrca = getMRCA(tree, tip=c('Tcas', 'Sory')) #分岐年代推定に使うノードの指定
tree2 = chronopl(
tree,
100000,
age.min = 152.3, # 推定分岐年代の最小値(MYA)
age.max = 236.2, # 推定分岐年代の最大値(MYA)
node = mrca, # getMRCAで指定したノード
S = 1,
tol = 1e-20,
CV = FALSE,
eval.max = 500,
iter.max = 500
)
is.ultrametric(tree2) # ultrametricかどうか確認[1] TRUE
#write.tree(tree2, file = "/Users/kosukesano/bio/for_cafe/241019_orthofinder_data/tree_ultrametric.nwk") # ultrametric系統樹の保存こうしてできたOrthogroups.GeneCount2.tsvとtree_ultrametric.nwkをDDBJの~/tools/for_cafe/241019_plusOjiroに転送した。
:~/bio/for_cafe/241019_orthofinder_data$ scp /Users/kosukesano/bio/for_cafe/241019_orthofinder_data/Orthogroups.GeneCount2.tsv kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_cafe/241019_plusOjiro
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
Orthogroups.GeneCount2.tsv 100% 438KB 7.0MB/s 00:00
:~/bio/for_cafe/241019_orthofinder_data$ scp /Users/kosukesano/bio/for_cafe/241019_orthofinder_data/tree_ultrametric.nwk kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_cafe/241019_plusOjiro
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
tree_ultrametric.nwk 100% 217 15.9KB/s 00:00
:~/bio/for_cafe/241019_orthofinder_data$ これを用いてCAFE5を実行
kosukesano@at139:~/tools/for_cafe/241019_plusOjiro$ singularity exec -e /usr/local/biotools/c/cafe:5.0.0--h5b5514e_2 cafe5 -i Orthogroups.GeneCount2.tsv -t tree_ultrametric.nwk
Command line: /usr/local/bin/cafe5 -i Orthogroups.GeneCount2.tsv -t tree_ultrametric.nwk
Filtering families not present at the root from: 13167 to 8317
No root family size distribution specified, using uniform distribution
Optimizer strategy: Nelder-Mead with similarity cutoff
Iterations: 300
Expansion: 2
Reflection: 1
Starting Search for Initial Parameter Values
Lambda: 0.0019787717120458
Score (-lnL): 125535.74185327
Lambda: 0.0019787717120458
Score (-lnL): 125535.74185327
Lambda: 0.0020777102976481
Score (-lnL): 125409.06131684
Lambda: 0.0021766488832504
Score (-lnL): 125339.43286286
Lambda: 0.0022755874688527
Score (-lnL): 125320.25002694
Lambda: 0.0024734646400572
Score (-lnL): 125410.58799608
Lambda: 0.0023745260544549
Score (-lnL): 125345.70788451
Lambda: 0.0021766488832504
Score (-lnL): 125339.43286286
Lambda: 0.0023745260544549
Score (-lnL): 125345.70788451
Lambda: 0.0023250567616538
Score (-lnL): 125327.7598648
Lambda: 0.0022261181760515
Score (-lnL): 125323.898933
Lambda: 0.0023250567616538
Score (-lnL): 125327.7598648
Lambda: 0.0023003221152532
Score (-lnL): 125322.66492998
Lambda: 0.0022508528224521
Score (-lnL): 125320.63456673
Lambda: 0.0023003221152532
Score (-lnL): 125322.66492998
Lambda: 0.0022879547920529
Score (-lnL): 125321.11212858
Lambda: 0.0022632201456524
Score (-lnL): 125320.0882598
Lambda: 0.0022508528224521
Score (-lnL): 125320.63456673
Lambda: 0.0022570364840522
Score (-lnL): 125320.27259376
Lambda: 0.0022694038072525
Score (-lnL): 125320.08098691
Lambda: 0.0022755874688527
Score (-lnL): 125320.25002694
Lambda: 0.0022724956380526
Score (-lnL): 125320.143545
Lambda: 0.0022663119764524
Score (-lnL): 125320.06260189
Lambda: 0.0022632201456524
Score (-lnL): 125320.0882598
Lambda: 0.0022647660610524
Score (-lnL): 125320.06989495
Lambda: 0.0022678578918525
Score (-lnL): 125320.0662948
Lambda: 0.0022647660610524
Score (-lnL): 125320.06989495
Lambda: 0.0022655390187524
Score (-lnL): 125320.0648584
Lambda: 0.0022670849341525
Score (-lnL): 125320.06310437
Lambda: 0.0022655390187524
Score (-lnL): 125320.0648584
Lambda: 0.0022659254976024
Score (-lnL): 125320.06338383
Lambda: 0.0022666984553025
Score (-lnL): 125320.0625129
Lambda: 0.0022670849341525
Score (-lnL): 125320.06310437
Lambda: 0.0022668916947275
Score (-lnL): 125320.06272849
Lambda: 0.0022665052158774
Score (-lnL): 125320.06247031
Lambda: 0.0022663119764524
Score (-lnL): 125320.06260189
Lambda: 0.0022664085961649
Score (-lnL): 125320.06251424
Lambda: 0.00226660183559
Score (-lnL): 125320.06246987
Completed 17 iterations
Time: 0H 0M 3S
Best match is: 0.00226660183559
Final -lnL: 125320.06246987
38 values were attempted (0% rejected)
Inferring processes for Base model
Score (-lnL): 125320.06246987
Maximum possible lambda for this topology: 0.004233700254022
Computing pvalues...
done!
Starting reconstruction processes for Base model
Done!
kosukesano@at139:~/tools/for_cafe/241019_plusOjiro$ 結果はこんな感じ
kosukesano@at139:~/tools/for_cafe/241019_plusOjiro$ singularity exec -e /usr/local/biotools/c/cafe:5.0.0--h5b5514e_2 cafe5 -i Orthogroups.GeneCount2.tsv -t tree_ultrametric.nwk
Command line: /usr/local/bin/cafe5 -i Orthogroups.GeneCount2.tsv -t tree_ultrametric.nwk
Filtering families not present at the root from: 13167 to 8317
No root family size distribution specified, using uniform distribution
Optimizer strategy: Nelder-Mead with similarity cutoff
Iterations: 300
Expansion: 2
Reflection: 1
Starting Search for Initial Parameter Values
Lambda: 0.0019787717120458
Score (-lnL): 125535.74185327
Lambda: 0.0019787717120458
Score (-lnL): 125535.74185327
Lambda: 0.0020777102976481
Score (-lnL): 125409.06131684
Lambda: 0.0021766488832504
Score (-lnL): 125339.43286286
Lambda: 0.0022755874688527
Score (-lnL): 125320.25002694
Lambda: 0.0024734646400572
Score (-lnL): 125410.58799608
Lambda: 0.0023745260544549
Score (-lnL): 125345.70788451
Lambda: 0.0021766488832504
Score (-lnL): 125339.43286286
Lambda: 0.0023745260544549
Score (-lnL): 125345.70788451
Lambda: 0.0023250567616538
Score (-lnL): 125327.7598648
Lambda: 0.0022261181760515
Score (-lnL): 125323.898933
Lambda: 0.0023250567616538
Score (-lnL): 125327.7598648
Lambda: 0.0023003221152532
Score (-lnL): 125322.66492998
Lambda: 0.0022508528224521
Score (-lnL): 125320.63456673
Lambda: 0.0023003221152532
Score (-lnL): 125322.66492998
Lambda: 0.0022879547920529
Score (-lnL): 125321.11212858
Lambda: 0.0022632201456524
Score (-lnL): 125320.0882598
Lambda: 0.0022508528224521
Score (-lnL): 125320.63456673
Lambda: 0.0022570364840522
Score (-lnL): 125320.27259376
Lambda: 0.0022694038072525
Score (-lnL): 125320.08098691
Lambda: 0.0022755874688527
Score (-lnL): 125320.25002694
Lambda: 0.0022724956380526
Score (-lnL): 125320.143545
Lambda: 0.0022663119764524
Score (-lnL): 125320.06260189
Lambda: 0.0022632201456524
Score (-lnL): 125320.0882598
Lambda: 0.0022647660610524
Score (-lnL): 125320.06989495
Lambda: 0.0022678578918525
Score (-lnL): 125320.0662948
Lambda: 0.0022647660610524
Score (-lnL): 125320.06989495
Lambda: 0.0022655390187524
Score (-lnL): 125320.0648584
Lambda: 0.0022670849341525
Score (-lnL): 125320.06310437
Lambda: 0.0022655390187524
Score (-lnL): 125320.0648584
Lambda: 0.0022659254976024
Score (-lnL): 125320.06338383
Lambda: 0.0022666984553025
Score (-lnL): 125320.0625129
Lambda: 0.0022670849341525
Score (-lnL): 125320.06310437
Lambda: 0.0022668916947275
Score (-lnL): 125320.06272849
Lambda: 0.0022665052158774
Score (-lnL): 125320.06247031
Lambda: 0.0022663119764524
Score (-lnL): 125320.06260189
Lambda: 0.0022664085961649
Score (-lnL): 125320.06251424
Lambda: 0.00226660183559
Score (-lnL): 125320.06246987
Completed 17 iterations
Time: 0H 0M 3S
Best match is: 0.00226660183559
Final -lnL: 125320.06246987
38 values were attempted (0% rejected)
Inferring processes for Base model
Score (-lnL): 125320.06246987
Maximum possible lambda for this topology: 0.004233700254022
Computing pvalues...
done!
Starting reconstruction processes for Base model
Done!
kosukesano@at139:~/tools/for_cafe/241019_plusOjiro$ マダラゲノムを使用した系統樹作成
~/tools/for_IQTREE/241019_plusOjiroディレクトリでIQTREE_1.pyを作成、実行した。
### IQTREE_1.py
##analysis_manual.pptxの#46も参照
##AFTER you made MSA file(all_seq.fa) in DDBJ with makeMSA.sh
##時間は10secほど
import numpy as np
import pandas as pd
import os
path = "/home/kosukesano/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/"
withpath = "../../for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/"
OGs = pd.read_table(path + "Orthogroups/Orthogroups.tsv")
# ManualPhylo_dataディレクトリが存在しない場合は作成
os.makedirs(path + "ManualPhylo_data", exist_ok=True)
##with openは相対パスしか受け付けないらしい
new = pd.DataFrame()
with open(withpath + "Orthogroups/Orthogroups_SingleCopyOrthologues.txt", "r") as fin:
for line in fin:
li = line.rstrip()
new = pd.concat([new, OGs[OGs["Orthogroup"] == li]])
print(new)
new.to_csv(path + "ManualPhylo_data/OG_list.txt", sep = " ", index = False, header = False)
##OG_list.txtと同じ順番の種名リストであるspecies_list.txtを作成
##できたOG_list.txtに、DDBJで作ったall_seq.faで配列情報を与える。
li = []
allspe = OGs.columns.tolist()
allspe2 = allspe[1:len(allspe)]
with open(withpath + "ManualPhylo_data/species_list.txt", "w") as file:
for column_name in allspe2:
file.write("%s\n" % column_name)続いてconcatinate.shを作成、実行した。
#$ -S /bin/bash
#$ -cwd
echo start at
date
# Enter the directory containing the fasta files
filesout="/home/kosukesano/tools/for_orthofinder/241019_6plusOjiro" ## Please replace with the actual directory containing the fasta files
# Define the output directory and output file
new="/home/kosukesano/tools/for_IQTREE/241019_plusOjiro"
mkdir -p $new
# Concatenate all fasta files into one file
for file in "$filesout"/*.fasta; do
cat "$file" >> "${new}/all_seq.fa"
done
dateこれによりall_seq.faができた。
次にIQTREE_2.pyを作り、実行。
### IQTREE_2.pyの中身
import sys
from Bio import SeqIO
path = "../../for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/ManualPhylo_data/"
fasta_in = sys.argv[1] #1番目の引数には上記のall_seq.faなどfastaファイルを指定する
query_in = sys.argv[2] #2番目の引数には上記のOG_list.txtなどオーソログファイルを指定する
for q in open(query_in, "r"): #オーソログファイルを開いて1行づつ読み込む
query = q.split() #スペース毎に切りとってリスト形式でqueryに保存する
f = open(path + query[0], 'w') #最初の列(OG名)と同じ名前のファイルを作成する
for record in SeqIO.parse(fasta_in, 'fasta'): #fastaファイルを開くSeqIOを使ってパースする(1項目づつ読み込む)
id_part = record.id #fastaのID部分を読み込む
desc_part = record.description #fastaのdescription部分を読み込む
seq = record.seq #fastaの配列部分を読み込む
for i in range(len(query)): #オーソログファイル中の各OGに含まれる配列数を数えて、その分繰り返す(python2の人はrange を x rangeにする)
if desc_part == query[i] : #オーソログファイルの配列descriptionとfastaの配列descriptionが一致したら、、、
fasta_seq = '>' + desc_part + '\n' + seq + '\n' #fasta形式に整え
print(fasta_seq) #標準出力にfastaを出力(進行状況把握用)
f.write(str(fasta_seq)) #各OGファイルにfastaを出力
f.close()実行のコマンドは以下
python IQTREE_2.py all_seq.fa ../../for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/ManualPhylo_data/OG_list.txt 結構時間かかる
1020
~/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/ManualPhylo_dataでalign.shを作成した。
### align.shの中身
#!/bin/sh
#$ -S /bin/bash
#$ -cwd
#$ -v PATH
awk '{print($1)}' $1 | while read x; do #引数に前述のOG_list.txtなどのOGリストを指定する。
mafft --auto $x > $x.maffted.fa
trimal -in $x.maffted.fa -out $x.maffted.trimed.fa -keepheader -htmlout $x.maffted.trimed.fa.html -automated1
done以下のコマンドで実行
(MPT) kosukesano@at138:~/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/ManualPhylo_data$ sh align.sh OG_list.txt これにより*.maffted.faと*.maffted.trimed.faができた。
続いて、同じディレクトリでmakerun.pyを作る。
### makerun.pyの中身
import glob
import os
list = []
for i in glob.glob('*.maffted.trimed.fa'):
list.append(os.path.split(i)[1].rstrip())
#print(list[0])
##ls | grep "maffted.trimed.edit.fa" > otamesi.txtで、完成したOGをotamesi.txtに一行ずつ保存
##ファイルの行数をカウント。このカウント数がfor文のrangeに入る数になる
f = open("run.nex", "w")
f.write("#nexus" + "\n")
f.write("begin sets;" + "\n")
character = "charset part"
for line, i in zip(list, range(4997)):
row = character + str(i+1) + " = " + line + ": ;"
f.write("\t" + row + "\n")
f.write("end;" + "\n")
f.close()これでrun.nexが出力される。
続いてIQ-TREEの実行。使ったシェルスクリプトはmanualphylo.sh
### manualphylo.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
date
singularity exec -e /usr/local/biotools/i/iqtree:2.3.3--h21ec9f0_0 iqtree2 -sp run.nex -nt AUTO -bb 1000 -cptime 600
dateASTRALの実行
同じディレクトリでmakealltree.shを書いた。
### makealltree.sh
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date
# Singularityイメージのパスを指定
SINGULARITY_IMAGE="/usr/local/biotools/i/iqtree:2.3.3--h21ec9f0_0"
# 作業ディレクトリに移動
cd ~/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/ManualPhylo_data
# 出力ファイル
output_file="all_trees.nwk"
# 既存の出力ファイルを削除
if [ -f $output_file ]; then
rm $output_file
fi
# *.maffted.trimed.edit.fa ファイルを処理
for file in *.maffted.trimed.fa; do
# ファイル名から拡張子を除いたベース名を取得
base_name=$(basename $file .maffted.trimed.fa)
# Singularityを使用してIQ-TREEを実行して系統樹を作成
singularity exec -e $SINGULARITY_IMAGE iqtree2 -s $file -nt AUTO -bb 1000 -cptime 600 -pre ${base_name}
# 作成された系統樹ファイル (.treefile) を output_file に追加
if [ -f ${base_name}.treefile ]; then
echo -n "${base_name}: " >> $output_file
cat ${base_name}.treefile >> $output_file
echo "" >> $output_file
else
echo "Error: ${base_name}.treefile not found" >&2
fi
done
echo "All trees have been written to $output_file"
dateこれをqsubで投げた。
branchモデルの解釈
A_FN=read.csv("/Users/kosukesano/bio/for_paml/241019_branch/ASTRAL_free_vs_neut_lrt_results_with_w_ratios.txt", sep="\t")|>
rename(A_FN_OG_num = OG_num, A_FN_p_val = p_val, A_FN_significant = positive_selection, A_FN_w_ratio=Smad_w_ratio)|>
dplyr::filter(A_FN_significant == "+") |>
dplyr::filter(A_FN_w_ratio != "None")
A_FM=read.csv("/Users/kosukesano/bio/for_paml/241019_branch/ASTRAL_free_vs_M0_lrt_results_with_w_ratios.txt", sep="\t")|>
rename(A_FN_OG_num = OG_num, A_FM_p_val = p_val, A_FM_significant = positive_selection, A_FM_w_ratio=Smad_w_ratio)|>
dplyr::filter(A_FM_significant == "+") |>
dplyr::filter(A_FM_w_ratio != "None")
A_branch=dplyr::full_join(A_FN, A_FM, by = "A_FN_OG_num")
# 1018行
I_FN=read.csv("/Users/kosukesano/bio/for_paml/241019_branch/IQTREE_free_vs_neut_lrt_results_with_w_ratios.txt", sep="\t")|>
rename(I_FN_OG_num = OG_num, I_FN_p_val = p_val, I_FN_significant = positive_selection, I_FN_w_ratio=Smad_w_ratio)|>
dplyr::filter(I_FN_significant == "+") |>
dplyr::filter(I_FN_w_ratio != "None")
I_FM=read.csv("/Users/kosukesano/bio/for_paml/241019_branch/IQTREE_free_vs_M0_lrt_results_with_w_ratios.txt", sep="\t")|>
rename(I_FN_OG_num = OG_num, I_FM_p_val = p_val, I_FM_significant = positive_selection, I_FM_w_ratio=Smad_w_ratio)|>
dplyr::filter(I_FM_significant == "+") |>
dplyr::filter(I_FM_w_ratio != "None")
I_branch=dplyr::full_join(I_FN, I_FM, by = "I_FN_OG_num")
# 905行
branch=dplyr::full_join(A_branch, I_branch, by = c(A_FN_OG_num = "I_FN_OG_num"))
orthogroups_file <- "/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/Orthogroups.tsv"
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
read.delim(orthogroups_file, header=FALSE, sep="\t",
#stringsAsFactors=FALSE,
#col.names = "Data"
skip=1
)|>
dplyr::select("V1", "V5")|>
rename(gene_ID = V5)|>
dplyr::mutate(gene_ID = stringr::str_replace(gene_ID, "^Smad_", ""))
signal=read.csv("/Users/kosukesano/bio/out_madara_SP.txt", sep="\t")|>
dplyr::select("X..SignalP.5.0", "Organism..Eukarya")|>
rename(gene_ID = X..SignalP.5.0, signal_seq = Organism..Eukarya)
branch2=dplyr::left_join(branch, orthogroups, by = c(A_FN_OG_num = "V1"))|>
dplyr::left_join(fa, by = c(gene_ID = "Madara"))|>
dplyr::left_join(deg_all, by = "gene_ID")|>
dplyr::left_join(signal, by = "gene_ID")b_free VS b_neutの比較(特定の枝でdN/dSが1より大きいか)
A_FN_p_val A_FN_w_ratio gene_ID
1 2.292090e-04 2.83305 g10787.t1
2 4.515303e-14 1.89794 g9945.t1
Sory_GeneFunction
1 probable tRNA N6-adenosine threonylcarbamoyltransferase, mitochondrial
2 laminin subunit alpha
ovary.body_log2FC ovary.body_adjPval adult.llarva_log2FC adult.llarva_adjPval
1 2.542604 3.48e-17 NA NA
2 -1.616277 9.89e-06 NA NA
adult.mlarva_log2FC adult.mlarva_adjPval signal_seq
1 NA NA OTHER
2 NA NA SP(Sec/SPI)
2つの遺伝子について、マダラの枝でdN/dSが1より大きい。これらはbranch_siteでも取れた遺伝子。IQTREEについても同様だった。
A_FN_p_val A_FN_w_ratio gene_ID
1 1.115680e-04 0.00416139 g6560.t1
2 1.775299e-10 0.0266867 g6092.t1
3 1.507048e-08 0.0288913 g1585.t1
4 3.564588e-02 0.0545569 g12688.t1
5 2.624653e-02 0.0694119 g11623.t1
6 2.751868e-04 0.0468634 g8108.t1
Sory_GeneFunction ovary.body_log2FC
1 myosin-1-like -2.8040404
2 U2 snRNP-associated SURP motif-containing protein NA
3 conserved oligomeric Golgi complex subunit 6 NA
4 transmembrane emp24 domain-containing protein bai 0.5153544
5 isocitrate dehydrogenase -1.1550132
6 adenosylhomocysteinase 1.4553454
ovary.body_adjPval adult.llarva_log2FC adult.llarva_adjPval
1 7.398570e-04 2.381792 0.000534682
2 NA NA NA
3 NA NA NA
4 6.420052e-03 NA NA
5 2.580000e-05 NA NA
6 4.010000e-16 NA NA
adult.mlarva_log2FC adult.mlarva_adjPval signal_seq
1 2.908176 2.28e-05 OTHER
2 NA NA OTHER
3 NA NA OTHER
4 NA NA SP(Sec/SPI)
5 NA NA OTHER
6 NA NA OTHER
920つの遺伝子について、マダラの枝でdN/dSが1より小さい(はじめの6行のみ表示)。IQTREE版では958遺伝子で、ASTRALの結果を完全に包含する。
見たいのは小さい方だけど、ちょっと多いな……。
ShinyGOでGO解析してみる。
bn_neg_go=read.csv("/Users/kosukesano/bio/for_shinygo/241020_bn_neg_enrichment_all.csv", sep=",")|>
print() Enrichment.FDR nGenes Pathway.Genes Fold.Enrichment
1 1.111133e-15 41 133 4.731786
2 5.275277e-06 16 52 4.722908
3 4.448801e-05 115 1151 1.533611
4 4.448801e-05 25 133 2.885235
5 5.147560e-04 21 119 2.708727
6 5.764535e-04 11 40 4.221099
7 1.320669e-03 10 37 4.148500
8 1.477654e-03 10 38 4.039329
9 3.373044e-03 19 121 2.410244
10 5.177854e-03 8 30 4.093187
11 9.279724e-03 18 123 2.246261
12 1.530829e-02 15 99 2.325674
13 1.847989e-02 13 82 2.433449
14 2.118504e-02 6 23 4.004204
15 2.188038e-02 7 31 3.466005
16 2.700478e-02 11 68 2.482999
17 4.775602e-02 4 13 4.722908
18 7.527918e-02 17 145 1.799591
19 7.932791e-02 12 91 2.024103
20 8.496575e-02 5 24 3.197802
21 8.496575e-02 6 33 2.790809
22 8.496575e-02 6 33 2.790809
23 9.341023e-02 6 34 2.708727
24 1.854880e-01 12 107 1.721434
25 1.854880e-01 4 21 2.923705
Pathway
1 Path:dme03010 Ribosome
2 Path:dme03050 Proteasome
3 Path:dme01100 Metabolic pathways
4 Path:dme04141 Protein processing in endoplasmic reticulum
5 Path:dme03040 Spliceosome
6 Path:dme03022 Basal transcription factors
7 Path:dme00510 N-Glycan biosynthesis
8 Path:dme03420 Nucleotide excision repair
9 Path:dme01200 Carbon metabolism
10 Path:dme00513 Various types of N-glycan biosynthesis
11 Path:dme04144 Endocytosis
12 Path:dme03013 Nucleocytoplasmic transport
13 Path:dme03008 Ribosome biogenesis in eukaryotes
14 Path:dme03060 Protein export
15 Path:dme03020 RNA polymerase
16 Path:dme01230 Biosynthesis of amino acids
17 Path:dme00534 Glycosaminoglycan biosynthesis-heparan sulfate/heparin
18 Path:dme00190 Oxidative phosphorylation
19 Path:dme04146 Peroxisome
20 Path:dme00030 Pentose phosphate pathway
21 Path:dme00280 Valine leucine and isoleucine degradation
22 Path:dme03250 Viral life cycle-HIV-1
23 Path:dme03030 DNA replication
24 Path:dme04120 Ubiquitin mediated proteolysis
25 Path:dme04130 SNARE interactions in vesicular transport
URL
1 http://www.genome.jp/kegg-bin/show_pathway?dme03010
2 http://www.genome.jp/kegg-bin/show_pathway?dme03050
3 http://www.genome.jp/kegg-bin/show_pathway?dme01100
4 http://www.genome.jp/kegg-bin/show_pathway?dme04141
5 http://www.genome.jp/kegg-bin/show_pathway?dme03040
6 http://www.genome.jp/kegg-bin/show_pathway?dme03022
7 http://www.genome.jp/kegg-bin/show_pathway?dme00510
8 http://www.genome.jp/kegg-bin/show_pathway?dme03420
9 http://www.genome.jp/kegg-bin/show_pathway?dme01200
10 http://www.genome.jp/kegg-bin/show_pathway?dme00513
11 http://www.genome.jp/kegg-bin/show_pathway?dme04144
12 http://www.genome.jp/kegg-bin/show_pathway?dme03013
13 http://www.genome.jp/kegg-bin/show_pathway?dme03008
14 http://www.genome.jp/kegg-bin/show_pathway?dme03060
15 http://www.genome.jp/kegg-bin/show_pathway?dme03020
16 http://www.genome.jp/kegg-bin/show_pathway?dme01230
17 http://www.genome.jp/kegg-bin/show_pathway?dme00534
18 http://www.genome.jp/kegg-bin/show_pathway?dme00190
19 http://www.genome.jp/kegg-bin/show_pathway?dme04146
20 http://www.genome.jp/kegg-bin/show_pathway?dme00030
21 http://www.genome.jp/kegg-bin/show_pathway?dme00280
22 http://www.genome.jp/kegg-bin/show_pathway?dme03250
23 http://www.genome.jp/kegg-bin/show_pathway?dme03030
24 http://www.genome.jp/kegg-bin/show_pathway?dme04120
25 http://www.genome.jp/kegg-bin/show_pathway?dme04130
Genes
1 RpS26 mRpL13 mRpL20 RpL28 RpL21 mRpL23 RpS10b RpL13A RpL5 RpL10 RpS3A mRpL9 RpL12 RpL7A mRpL11 RpS9 RpL23 RpS17 RpS16 RpLP1 bonsai RpL13 RpL7 mRpL12 mRpL15 RpL4 RpS2 RpL14 RpL18A RpS25 RpL24-like RpL26 RpL36A RpL22 RpL36 mRpL2 tko RpS15 RpL18 RpS11 mRpL21
2 Rpt5 Rpn5 Rpn2 Prosbeta7 Prosbeta5 Rpt3 Rpn11 CG30382 Prosbeta2 Rpn8 Rpt4 Prosbeta6 Rpn12 Rpn3 Prosalpha6 Prosalpha2
3 Prat2 ttv CG10166 CG10425 amd ScsbetaG Ddc Sc2 ATPsynbeta Sirt7 Ahcy beta4GalT7 Alg9 alpha-Man-Ib PIG-B nSMase sro PIG-C ND-30 CG12338 ND-B14.5B Alg2 PIG-U S-Lap7 Sgsh CG15093 botv Stt3A GCS1 CG1673 p23 Glo1 ScpX CG17333 Eno FIG4 CG18003 Alg1 Aprt Gk1 FeCH Pgant6 Gclc CG2767 Taldo Tdc2 Rpe Prx6005 Pgk fbp Sirt4 ND-ASHI Hsepi Alg10 ND-MLRQ Mccc2 AsnS AdSL ND-19 Pgd Paics Fum1 UQCR-11 UQCR-C2 Got2 Hmgs Mipp2 CG44243 PIG-L CTPsyn Mtpbeta ATPsynF Agpat4 Cyt-c1 CG5009 Las RnrL Pi3K59F Ppcs Ppox Men-b P5cr ND-39 ATPsynD Dak1 rt CG6218 OstDelta GlcT Idh3b CG6638 Hacd1 CG6910 VhaPPA1-1 PyK GlyP CG7461 Pgant35A Stt3B Cds Vha36-1 ATPsynB Sps1 trx CG8665 CG8745 Sply Ost48 mtm ND-51 ND-ACP mAcon1 Amacr AcCoAS CG9886
4 CG10973 alpha-Man-Ib Der-2 Stt3A GCS1 Roc1a Rad23 l(1)G0320 CG4603 Plap Gint3 Gp93 CG5823 CG5885 OstDelta Sec13 wbl eff Stt3B Hsc70-2 CG7945 P58IPK ERp60 Ost48 Calr
5 l(2)37Cb PQBP1 CG17768 Hpr1 noi l(1)G0007 hoip snf BCAS2 Prp19 CG6015 U4-U6-60K CG6418 CG6841 Prp31 CG7483 Hsc70-2 Prp3 Bx42 snRNP-U1-70K CG9346
6 TfIIEalpha TfIIEbeta Cdk7 Taf11 TfIIB TfIIFbeta Taf8 CycH Mat1 hay Tfb1
7 Alg9 alpha-Man-Ib Alg2 Stt3A GCS1 Alg1 Alg10 OstDelta Stt3B Ost48
8 Roc1a Rad23 Cdk7 mei-9 CycH Mat1 hay Tfb1 PCNA RPA2
9 ScsbetaG CG15093 CG17333 Eno CG18003 Taldo Rpe Pgk fbp Pgd Fum1 Got2 CG5009 Men-b Idh3b PyK mAcon1 AcCoAS CG9886
10 Alg9 alpha-Man-Ib Alg2 Stt3A Alg1 OstDelta Stt3B Ost48
11 cpa Vps36 Arpc2 Arf102F Cdc42 Vps25 cpb Snx1 Chmp1 Arpc4 Vps60 Snx3 AP-2mu Hsc70-2 Vta1 Arf51F Vps45 TSG101
12 Karybeta3 Nxt1 Hpr1 Tnpo-SR Pym thoc6 Bin1 Nup35 Nup107 Sec13 cdm CG7483 Phax Kr-h2 Kap-alpha3
13 Rat1 Nxt1 Ns3 eIF6 CG2972 Nop60B hoip Ns1 Ns2 CG8064 CG8549 Non1 CG9107
14 Spase25 SrpRalpha Srp54k Srp14 Srp72 CG9240
15 l(2)37Cg CG12267 RpI12 RpII15 CG3756 RpI135 RpII33
16 CG1673 Eno Taldo Rpe Pgk AsnS Got2 P5cr Idh3b PyK mAcon1
17 beta4GalT7 botv Hsepi Hs6st
18 ATPsynbeta ND-30 ND-B14.5B ND-ASHI ND-MLRQ ND-19 UQCR-11 UQCR-C2 ATPsynF Cyt-c1 ND-39 ATPsynD VhaPPA1-1 Vha36-1 ATPsynB ND-51 ND-ACP
19 Hacl CG12338 CG14778 CG1662 ScpX CG18003 CG5009 Pex19 Pex3 Prx5 Sod3 Amacr
20 CG17333 Taldo Rpe fbp Pgd
21 CG15093 CG1673 Mccc2 Hmgs Mtpbeta CG6638
22 spt4 Tnpo-SR Su(Tpl) Cdk9 BicD TSG101
23 CG11164 dpa Mcm5 Mcm7 PCNA RPA2
24 Vhl Uba3 Cul5 Roc1a UbcE2H CG2924 fzy Prp19 CG5823 eff Uba2 CG7747
25 Sec20 Vti1a Syx5 Gos28
結構取れた。
特に
- 4_小胞体におけるタンパク質の処理
- 14_タンパク質輸送
- 25_小胞輸送におけるSNARE相互作用
あたりは物質分泌に関わっていそう
CAFE5で増幅が検出された遺伝子をShinyGOによりGO解析にかける
出力ファイルはこんな感じ。
cafe_go=read.csv("/Users/kosukesano/bio/for_shinygo/241020_cafe_shinygo.csv", sep=",")|>
print() Enrichment.FDR nGenes Pathway.Genes Fold.Enrichment
1 0.09676809 1 24 19.40000
2 0.09676809 1 35 13.30286
3 0.09676809 1 32 14.55000
Pathway
1 Path:dme00030 Pentose phosphate pathway
2 Path:dme00052 Galactose metabolism
3 Path:dme00500 Starch and sucrose metabolism
URL Genes
1 http://www.genome.jp/kegg-bin/show_pathway?dme00030 Gld
2 http://www.genome.jp/kegg-bin/show_pathway?dme00052 Mal-A8
3 http://www.genome.jp/kegg-bin/show_pathway?dme00500 Mal-A8
取れてきた経路は以下の通り。
1021
オジロを含めたIQ-TREE続き
各ファイルのヘッダーに遺伝子名が入っていて、別種扱いになって出力ファイルが出なかった。
~/tools/for_IQTREE/241019_plusOjiroでIQTREE_3.pyを作成、実行した。
### IQTREE_3.pyの中身
import os
# ファイルのヘッダーを変更する関数
def modify_headers(input_file, output_file):
with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
for line in infile:
if line.startswith(">"):
# ヘッダー行の最初の四文字を抽出して書き換え
outfile.write(f">{line[1:5]}\n")
else:
outfile.write(line)
# 作業ディレクトリ内のすべての ".maffted.trimed.fa" ファイルに対して処理を適用し、出力を別ディレクトリに保存
def process_directory(input_directory, output_directory):
if not os.path.exists(output_directory):
os.makedirs(output_directory)
for filename in os.listdir(input_directory):
if filename.endswith(".maffted.trimed.fa"):
input_file = os.path.join(input_directory, filename)
output_file = os.path.join(output_directory, filename.replace(".maffted.trimed.fa", ".maffted.trimed.edit.fa"))
modify_headers(input_file, output_file)
print(f"Processed: {filename}")
# 実行するディレクトリを指定
input_directory = "/home/kosukesano/tools/for_orthofinder/241019_6plusOjiro/OrthoFinder/Results_Oct19/ManualPhylo_data"
output_directory = "/home/kosukesano/tools/for_IQTREE/241019_plusOjiro"
process_directory(input_directory, output_directory)ここでmakerun.pyとmanualphylo.shをもう一度作成し、実行した。
1030
進化解析で使うゾウムシの選定
Dendroctonus ponderosaeと同族の種
これらはどちらもBRAKERをかけてみて、どっちかorどっちも使う。
なお、Ceutorhynchus assimilisおよびその亜科であるCeutorhynchinae(サルゾウムシ亜科)に該当する種はNCBIには登録されていなかった……。
とりあえずこの2種のゲノムを遺伝研に送ってソフトマスク
Dfroについて
:~/Downloads$ scp /Users/kosukesano/Downloads/Dfro.zip kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_softmask/nama_data/Dfro_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
Dfro.zip 100% 52MB 90.6MB/s 00:00 >CM078935.1 Dendroctonus frontalis isolate MC-2024a chromosome 1, whole genome shotgun sequence
AAGTTGTAACGATGTACTGTTCTGTTTAAGCTCCGATAGTTCTTCGTCGGATTGTTGTGCTTGACATAGTTCTGCTGTAC
TGACTATGACAGGAAAAGCTATTGCATCAATGCGAGAAAGTGCATCAGCAATAACGTTTTCAACACCAGATACGTGGACT
ATGTGAGTCGTGAACTGTCCTATAAAGTCCAGGTGTCTGAGTTGTCTTGGAGTGGCTTTGTCAGCCTTTTGCCGGAAGGC
GAATATGAGCGGTTTGTGATCGGTCTTGATGATTAACTGTCGGCCTTCTACCATAAATCTGAAGAATTTCAAACTGGTGT
AAATAGCTAGTAGTTCACGATCGTACGTGCTATAACTCGACTGAGCGTTGCTGAATTTCTTTGAAAAGAATCCTAGTGGC
TCCCAGCAACCATTATTGTGTTGTTCTAGCACGGCACCCATTGCGGTATCCGATGCATCGGTGTAAAGAGCTAATGGTGC
ATTATCTTTAGGATGGTTTAGTAAAGAAGCTGTAGTCAGTTGTTGCTTGCATTGTTCAAAAGCTTCTTTAAGTTCATCCG
TCCAGTTGATGGGGCGTTTGTCACGTTTCTTAGCGCCAGCTAGTAATGCATGAAGTGGTGTTTGTGTAGAGGCAGCATTT
CGGATAAAACGCCTATAGAAATTGATAACGCCTAGAAAACGTCGCATGTCTGCAATGGTAGCTGGTTGAGGATATTCCTG
TATTGAGGCTACTCGTTCGGGTAACGGTCTGGTACCTTCACCATTTATCAGGTATCCCAGATagttaatttcagattttc
caAACTGGCACTTGGCGATATTAATGGATATGCCGTATTGTCTCAGTCGATTGAAGACTTGAGTGAGATGTTCTATGTGT
TCTTCAAGAGTTGATGAAGCTACAAGTATGTCGTCAATATAACAAAAGACAAACTTGAAGTCGTGTAGCACTAGATTCAT
AAATCGTTGGAAAGTCTGAGCTGCATTACATAATCCAAATGTCATTACGTTGAACTCGAATAAACCAAATGGCGTTATGA
TGGCGGTTTTGGGACGATCTTCTGGTAGTACAGGTATTTGATTATATGCCCTTACAAGATCTAGTGAACTGAATATTTTG
GTGCCTTGCAATTTGTGTGCAAAATCTTGAATGTGAGCAATTGGATATTTGTCTGGTAGGGTGACACTGTTAAGACGACG
ATAATCACCACAAGGGCGCCAGTctccatttttctttggtaCTAAGTGTAGTGGGCTGGCCCAAGGACTATTTGATGGAC
TGCACATACCTTGTTCTACCATAAAATTGAATTCCGCTTCTGCTAGCTTCAATTTTTCGGGAGACAGCCTTCTAGCTCTG
TCAGCTAGGGGTGGACCAGTAGTCTCTATGTGGTGGTAGATTCCATGTGAAGGATTTAAGATGCTTGGTTTGGAAGGAAC
AGttaaatctgcaaatttgtccaaaagttttttaaatggagtATTTCCGGATATAGTGCATATATTTCCTTGTGGGTACG
CAAATATTGCTCCTTTACTATTTAAAGTTGTGGTGCTGTCGATgagctttttgtttttaagatcTACCAATAGTCCAAAG
TGATTGAGAAAATCTGCTCCGAGCATGGGACGTGAAACATCTGCTATCGTGAATGGCCAACGAAAGAGGCGGCGTAGGCC
GAGATCCACGTTTAGTAATTGCTGTCCATAAGTATTTATTTCTGTGTTGTTGGCTGCGTATAGTTTGTAGTTCGATGCGソフトマスクはされていそう
kosukesano@at138:~/tools/for_softmask/nama_data/Dfro_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a ncbi_dataset/data/GCA_040113315.1/GCA_040113315.1_ASM4011331v1_genomic.fna
file format type num_seqs sum_len min_len avg_len max_len Q1 Q2 Q3 sum_gap N50 Q20(%) Q30(%) GC(%)
ncbi_dataset/data/GCA_040113315.1/GCA_040113315.1_ASM4011331v1_genomic.fna FASTA DNA 373 173,601,287 1,003 465,419 42,498,342 1,754 3,607 18,959 0 24,829,404 0 0 36.62
kosukesano@at138:~/tools/for_softmask/nama_data/Dfro_data$ コンティグ数373とめちゃくちゃに繋がってて草。これをBRAKERの生データディレクトリにコピーする。
kosukesano@at138:~/tools/for_braker/nama_data$ cp ~/tools/for_softmask/nama_data/Dfro_data/ncbi_dataset/data/GCA_040113315.1/GCA_040113315.1_ASM4011331v1_genomic.fna Dfro.fna
kosukesano@at138:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta Elaeidobius_kamerunicus.masked.fna Pst_NotUseEDTA_upper5000.fna femo_busco.sh.o26221930 kohuki_busco.sh.po26238968
241017_Ojiro_masked.fa GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz Sfem_RNAseq femo_busco.sh.pe26221930 kohuki_softmasked.fasta
BUSCO_OUTPUT_FEMO_GENOME Madara_RNAseq Sfem_pilon_softmasked.fasta femo_busco.sh.po26221930 kohuki_softmasked_upper1000.fasta
BUSCO_OUTPUT_KOHUKI_GENOME Ojiro_RNAseq Sfem_softmasked.fasta kohuki_busco.sh length.txt
Dfro.fna Pst_NotUseEDTA.fna busco_downloads kohuki_busco.sh.e26238968 madaralength.txt
Ekam_NotUseEDTA.fna Pst_NotUseEDTA_upper1000.fna femo_busco.sh kohuki_busco.sh.o26238968
Ekam_oomoji.fna Pst_NotUseEDTA_upper10000.fna femo_busco.sh.e26221930 kohuki_busco.sh.pe26238968
kosukesano@at138:~/tools/for_braker/nama_data$~/tools/for_braker/DfroでDfro_braker.shを作成、実行した。
### Dfro_braker.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/Dfro.fna\
--prot_seq=/home/kosukesano/tools/Arthropoda.fa\
--threads=16\
--species=Dfro\
--AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
--AUGUSTUS_BIN_PATH=/usr/bin\
--AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
--TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
dateDvalについて
>JAJTJO010000001.1 Dendroctonus valens isolate Dva2017 scaffold_0, whole genome shotgun sequence
CCATAGTAGTAGATATTCATAATAAGTAGCAAAAAGGATCAGTCTACTCACATAGTAAGTCTACATTTCGAGAAACAAAA
TTGGAGAAGCTTTGAGGTAAACCAGTGGATGCGAATATGTGATAGATTAATTCCTGTTTGAGTTTTGCCTGAGAAAAGCG
TCGAAgctgaaagaaaaattgaaatccGTTATGAGACCAAACGCTCATAACCCACTTTAACCACATGAATACTAATTTAA
ATGCGCTACTACCTGAATTACCATTAGACCACAGCTGAAACGTTACGGTAACCATCGCTGGAAAATAGACAACATTTAGA
TGTAAACTACCACTTTCGACACACCCAGCTTGAATTCCGCATCCCATGAAGAACCGTCTCGATAAATTACTATTTCGGAA
TGCTTGATTTCTGTTAATGCAATTTGTCTCCCCGACATTAATTCTACGGAAATCGGTTGCAGATTTATTTCGAAATTGTT
GTATGCTCAAGCTGAAGACAAGAGAGATATTTTTGTCTGTCTCGTGTCTTTAAGATTAGCCGATTTTGTTTGATGTCACT
CGGCGCTTTAAATTTATGTGCCTCAAGATGTTCAAACACATTTATTGGACTGCAGTATTTTTCTCATGTTGCAATAAAAC
GCGAGATAAGATTGCAGCAAATCGGGCAATTGttgattcaaaaaattgttccCATACAAAACACcgattataaattaaat
tgtttttaaaaattattgtttgctGACTTCACAAAGGAAAATACTTTCTTACATATCTATAACCAAAACTTCTCGGAAGT
TAGTCTCAAATAGTCGTGGAAATGTGTAACTTCTTTTTTGTGTGCAACTTTAAAGCaaaatttcttgttttttgtAGGTCこっちもソフトマスクされているっぽい。
kosukesano@at138:~/tools/for_softmask/nama_data/Dval_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a ncbi_dataset/data/GCA_024550625.1/GCA_024550625.1_ASM2455062v1_genomic.fna
file format type num_seqs sum_len min_len avg_len max_len Q1 Q2 Q3 sum_gap N50 Q20(%) Q30(%) GC(%)
ncbi_dataset/data/GCA_024550625.1/GCA_024550625.1_ASM2455062v1_genomic.fna FASTA DNA 922 322,406,506 905 349,681.7 9,688,036 9,173 21,821 258,314 0 1,658,008 0 0 36.67
kosukesano@at138:~/tools/for_softmask/nama_data/Dval_data$min=905が若干不安だけど、多分いけんべ
~/tools/for_braker/DvalでDval_braker.shを作成、qsubで投げた。
### Dval_braker.sh
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/Dval.fna\
--prot_seq=/home/kosukesano/tools/Arthropoda.fa\
--threads=16\
--species=Dval\
--AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
--AUGUSTUS_BIN_PATH=/usr/bin\
--AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
--TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
datePissodes strobiゲノムの短いコンティグを除去
現状のデータはこんな感じ
kosukesano@at138:~/tools/for_braker/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a Pst_NotUseEDTA.fna
file format type num_seqs sum_len min_len avg_len max_len Q1 Q2 Q3 sum_gap N50 Q20(%) Q30(%) GC(%)
Pst_NotUseEDTA.fna FASTA DNA 84,140 2,025,024,129 201 24,067.3 2,554,738 2,904 6,869 16,574 0 105,159 0 0 32.01全長が2,025,024,129bp(2Gbp)で、コンティグ数が84,140。
これについてseqkitを使ってPstrの短いコンティグを削る。どれくらい削ればいいかわからなかったので、とりあえず3パターン作ってみた
1000bp未満を削ったものkosukesano@at138:~/tools/for_braker/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit seq -m 1000 Pst_NotUseEDTA.fna > Pst_NotUseEDTA_upper1000.fna [WARN] you may switch on flag -g/--remove-gaps to remove spaces kosukesano@at138:~/tools/for_braker/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a Pst_NotUseEDTA_upper1000.fna file format type num_seqs sum_len min_len avg_len max_len Q1 Q2 Q3 sum_gap N50 Q20(%) Q30(%) GC(%) Pst_NotUseEDTA_upper1000.fna FASTA DNA 83,427 2,024,625,705 1,000 24,268.2 2,554,738 2,970 6,975 16,735.5 0 105,221 0 0 32.01全長が
2,024,625,705bp(99.9%)、コンティグ数が83,427(99.1%)に。これを使って
BRAKERを実行kosukesano@at138:~/tools/for_braker/Pstr/upper_1_k$ ls Pstr_1k_braker.sh5000bp未満を削ったものkosukesano@at138:~/tools/for_braker/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit seq -m 5000 Pst_NotUseEDTA.fna > Pst_NotUseEDTA_upper5000.fna [WARN] you may switch on flag -g/--remove-gaps to remove spaces kosukesano@at138:~/tools/for_braker/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a Pst_NotUseEDTA_upper5000.fna file format type num_seqs sum_len min_len avg_len max_len Q1 Q2 Q3 sum_gap N50 Q20(%) Q30(%) GC(%) st_NotUseEDTA_upper5000.fna FASTA DNA 50,349 1,939,201,939 5,000 38,515.2 2,554,738 8,113 13,479 30,372 0 115,418 0 0 31.86全長が
1,939,201,939bp(95.7%)、コンティグ数が50,349(59.8%)に。10000bp未満を削ったものkosukesano@at138:~/tools/for_braker/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit seq -m 10000 Pst_NotUseEDTA.fna > Pst_NotUseEDTA_upper10000.fna [WARN] you may switch on flag -g/--remove-gaps to remove spaces kosukesano@at138:~/tools/for_braker/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a Pst_NotUseEDTA_upper10000.fna file format type num_seqs sum_len min_len avg_len max_len Q1 Q2 Q3 sum_gap N50 Q20(%) Q30(%) GC(%) Pst_NotUseEDTA_upper10000.fna FASTA DNA 32,051 1,806,682,273 10,000 56,369 2,554,738 14,212 22,767 52,547.5 0 131,293 0 0 31.61全長が
1,806,682,273bp(89.2%)、コンティグ数が32,051(38.1%)に。
コフキゲノムの短いコンティグを除去
kosukesano@at138:~/tools/for_braker/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a kohuki_softmasked.fasta
file format type num_seqs sum_len min_len avg_len max_len Q1 Q2 Q3 sum_gap N50 Q20(%) Q30(%) GC(%)
kohuki_softmasked.fasta FASTA DNA 2,372,896 3,664,337,660 48 1,544.2 151,585 86 100 363 0 15,058 0 0 32.29とりあえず1000bp未満を切り捨ててみる
kosukesano@at138:~/tools/for_braker/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a kohuki_softmasked_upper1000.fasta
file format type num_seqs sum_len min_len avg_len max_len Q1 Q2 Q3 sum_gap N50 Q20(%) Q30(%) GC(%)
kohuki_softmasked_upper1000.fasta FASTA DNA 397,892 3,349,012,532 1,000 8,416.9 151,585 1,967 4,187 10,292 0 17,050 0 0 32.27総配列数は3,349,012,532bp(91.4%)、コンティグ数は397,892(16.8%)になった。コンティグ数がめちゃくちゃに減った。
これを使ってBRAKERをかけてみる
1031
オジロを加えた7種でのCAFE結果
241019のCAFEの結果をローカルに移した。
:~/bio/for_cafe$ mkdir 241019_cafe_original_data
:~/bio/for_cafe$ cd 241019_cafe_original_data/
:~/bio/for_cafe/241019_cafe_original_data$ pwd
/Users/kosukesano/bio/for_cafe/241019_cafe_original_data
:~/bio/for_cafe/241019_cafe_original_data$ scp -r kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_cafe/241019_plusOjiro/results /Users/kosukesano/bio/for_cafe/241019_cafe_original_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
Base_clade_results.txt 100% 209 11.5KB/s 00:00
Base_asr.tre 100% 1870KB 3.5MB/s 00:00
Base_count.tab 100% 293KB 3.8MB/s 00:00
Base_results.txt 100% 160 8.8KB/s 00:00
Base_family_likelihoods.txt 100% 153KB 4.0MB/s 00:00
Base_family_results.txt 100% 144KB 3.7MB/s 00:00
Base_branch_probabilities.tab 100% 72KB 2.5MB/s 00:00
Base_change.tab 100% 398KB 4.3MB/s 00:00
:~/bio/for_cafe/241019_cafe_original_data$ lines = readLines("/Users/kosukesano/bio/for_cafe/241019_cafe_original_data/results/Base_asr.tre")
# TREESセクションのみを抽出する
trees_start <- which(grepl("BEGIN TREES;", lines))
trees_end <- which(grepl("END;", lines))
trees_lines <- lines[(trees_start + 1):(trees_end - 1)]
# 不要なスペースを削除
trees_lines <- gsub("^\\s+|\\s+$", "", trees_lines)
# データフレームに変換
library(tibble)
trees_df = tibble(Tree = trees_lines)
ex=trees_df|>###各枝で優位に増減したOGをTRUE/FALSEで表す
#lines|>
tidyr::separate(Tree, into = c("OG_num", "tree"), sep = r"(\s=\s)")|>#系統樹の文字列をOG番号の列とツリーの列に分割
dplyr::mutate(OG_num = stringr::str_extract(OG_num, "OG\\d+")) |>#OG番号の列の余計な文字を除去
dplyr::mutate(Smad = stringr::str_detect(tree, pattern="Smad<1>\\*_")) |>#有意な差のある遺伝子ファミリー(*がついてるやつ)を検出
dplyr::mutate(Ojiro = stringr::str_detect(tree, pattern="Ojiro<0>\\*_")) |>#有意な差のある遺伝子ファミリー(*がついてるやつ)を検出
dplyr::mutate(Smad_Ojiro = stringr::str_detect(tree, pattern="<2>\\*_")) |>
dplyr::mutate(Cass = stringr::str_detect(tree, pattern="<3>\\*_")) |>
dplyr::mutate(Ojiro_Cass = stringr::str_detect(tree, pattern="<4>\\*_")) |>
dplyr::mutate(Dpon = stringr::str_detect(tree, pattern="<5>\\*_")) |>
dplyr::mutate(Cass_Dpon = stringr::str_detect(tree, pattern="<6>\\*_")) |>
dplyr::mutate(Agra = stringr::str_detect(tree, pattern="<7>\\*_")) |>
dplyr::mutate(Curculionidae = stringr::str_detect(tree, pattern="<8>\\*_")) |>
dplyr::mutate(Sory = stringr::str_detect(tree, pattern="<9>\\*_")) |>
dplyr::mutate(Curculionoidea = stringr::str_detect(tree, pattern="<10>\\*_")) |>
dplyr::mutate(Tcas = stringr::str_detect(tree, pattern="<11>\\*_")) |>
dplyr::mutate(all = stringr::str_detect(tree, pattern="<12>\\*_"))|>
print()# A tibble: 8,318 × 15
OG_num tree Smad Ojiro Smad_Ojiro Cass Ojiro_Cass Dpon Cass_Dpon Agra
<chr> <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
1 OG0000000 (Tca… FALSE FALSE FALSE TRUE TRUE TRUE TRUE TRUE
2 OG0000002 (Tca… TRUE FALSE FALSE TRUE TRUE TRUE FALSE TRUE
3 OG0000005 (Tca… FALSE TRUE FALSE TRUE FALSE TRUE FALSE TRUE
4 OG0000006 (Tca… FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
5 OG0000007 (Tca… TRUE FALSE TRUE TRUE FALSE TRUE FALSE FALSE
6 OG0000008 (Tca… TRUE TRUE FALSE TRUE FALSE TRUE TRUE TRUE
7 OG0000010 (Tca… FALSE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
8 OG0000011 (Tca… TRUE FALSE FALSE TRUE FALSE TRUE FALSE FALSE
9 OG0000012 (Tca… FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
10 OG0000013 (Tca… FALSE TRUE FALSE TRUE FALSE FALSE FALSE FALSE
# ℹ 8,308 more rows
# ℹ 5 more variables: Curculionidae <lgl>, Sory <lgl>, Curculionoidea <lgl>,
# Tcas <lgl>, all <lgl>
### マダラでのみ有意なものを抽出
Smad_ex = ex|>
dplyr::filter(Smad == "TRUE")|>
dplyr::filter(Ojiro == "FALSE")|>
dplyr::filter(Smad_Ojiro == "FALSE")|>
dplyr::filter(Cass == "FALSE")|>
dplyr::filter(Ojiro_Cass == "FALSE")|>
dplyr::filter(Dpon == "FALSE")|>
dplyr::filter(Cass_Dpon == "FALSE")|>
dplyr::filter(Agra == "FALSE")|>
dplyr::filter(Curculionidae == "FALSE")|>
dplyr::filter(Sory == "FALSE")|>
dplyr::filter(Curculionoidea == "FALSE")|>
dplyr::filter(Tcas == "FALSE")|>
dplyr::filter(all == "FALSE") |>
print()# A tibble: 19 × 15
OG_num tree Smad Ojiro Smad_Ojiro Cass Ojiro_Cass Dpon Cass_Dpon Agra
<chr> <chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
1 OG0000158 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
2 OG0000182 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
3 OG0000378 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
4 OG0000440 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
5 OG0000479 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
6 OG0000567 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
7 OG0000684 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
8 OG0001789 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
9 OG0002093 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
10 OG0002847 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
11 OG0002850 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
12 OG0003493 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
13 OG0003527 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
14 OG0006333 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
15 OG0006392 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
16 OG0007730 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
17 OG0008099 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
18 OG0009924 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
19 OG0011431 (Tca… TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# ℹ 5 more variables: Curculionidae <lgl>, Sory <lgl>, Curculionoidea <lgl>,
# Tcas <lgl>, all <lgl>
Bc=read.csv("/Users/kosukesano/bio/for_cafe/241019_cafe_original_data/results/Base_change.tab", sep="\t")
Smad_bc=Bc |>###マダラで増加した0Gの0G番号を抽出したファイル
dplyr::select("FamilyID","Smad.1.") |>#OG番号の列とマダラでの遺伝子数の増減量が書いてある列のみを抽出
dplyr::mutate(Smad.1.= stringr::str_extract(Smad.1., r"(^\d+)")) |>#マダラの遺伝子量増減の列のうち、数字のみのもの(-がついておらず、遺伝子数が増加しているもの)を抽出
tidyr::drop_na()|>
dplyr::filter(Smad.1. != 0) #遺伝子数の増加分が0のものを除去
Smad_df=dplyr::inner_join(Smad_bc, Smad_ex, by = c(FamilyID = "OG_num"))|>##マダラで優位に増加したOGのOG番号を抽出したファイル
print() FamilyID Smad.1.
1 OG0000479 19
2 OG0000567 5
3 OG0000684 5
4 OG0001789 2
5 OG0002093 6
6 OG0002847 9
7 OG0002850 10
8 OG0003493 7
9 OG0003527 8
10 OG0006333 4
11 OG0006392 4
12 OG0007730 5
13 OG0008099 5
14 OG0009924 3
15 OG0011431 3
tree
1 (Tcas<11>_1:236.2,(Sory<9>_1:133.223,(Agra<7>_2:112.172,(Dpon<5>_0:101.931,(Cass<3>_0:92.1677,(Smad<1>*_20:78.9022,Ojiro<0>_1:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
2 (Tcas<11>_5:236.2,(Sory<9>_4:133.223,(Agra<7>_2:112.172,(Dpon<5>_1:101.931,(Cass<3>_1:92.1677,(Smad<1>*_8:78.9022,Ojiro<0>_3:78.9022)<2>_3:13.2655)<4>_3:9.76356)<6>_3:10.2402)<8>_3:21.0518)<10>_3:102.977)<12>_4;
3 (Tcas<11>_3:236.2,(Sory<9>_1:133.223,(Agra<7>_1:112.172,(Dpon<5>_4:101.931,(Cass<3>_2:92.1677,(Smad<1>*_8:78.9022,Ojiro<0>_3:78.9022)<2>_3:13.2655)<4>_3:9.76356)<6>_3:10.2402)<8>_3:21.0518)<10>_3:102.977)<12>_3;
4 (Tcas<11>_6:236.2,(Sory<9>_3:133.223,(Agra<7>_3:112.172,(Dpon<5>_0:101.931,(Cass<3>_0:92.1677,(Smad<1>*_3:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_2:21.0518)<10>_2:102.977)<12>_3;
5 (Tcas<11>_4:236.2,(Sory<9>_0:133.223,(Agra<7>_1:112.172,(Dpon<5>_1:101.931,(Cass<3>_0:92.1677,(Smad<1>*_7:78.9022,Ojiro<0>_1:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_2;
6 (Tcas<11>_1:236.2,(Sory<9>_1:133.223,(Agra<7>_0:112.172,(Dpon<5>_1:101.931,(Cass<3>_0:92.1677,(Smad<1>*_10:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
7 (Tcas<11>_2:236.2,(Sory<9>_0:133.223,(Agra<7>_0:112.172,(Dpon<5>_0:101.931,(Cass<3>_0:92.1677,(Smad<1>*_11:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
8 (Tcas<11>_2:236.2,(Sory<9>_0:133.223,(Agra<7>_1:112.172,(Dpon<5>_0:101.931,(Cass<3>_0:92.1677,(Smad<1>*_8:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
9 (Tcas<11>_1:236.2,(Sory<9>_0:133.223,(Agra<7>_1:112.172,(Dpon<5>_0:101.931,(Cass<3>_0:92.1677,(Smad<1>*_9:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
10 (Tcas<11>_1:236.2,(Sory<9>_0:133.223,(Agra<7>_0:112.172,(Dpon<5>_0:101.931,(Cass<3>_2:92.1677,(Smad<1>*_5:78.9022,Ojiro<0>_1:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
11 (Tcas<11>_2:236.2,(Sory<9>_2:133.223,(Agra<7>_0:112.172,(Dpon<5>_0:101.931,(Cass<3>_0:92.1677,(Smad<1>*_5:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
12 (Tcas<11>_1:236.2,(Sory<9>_0:133.223,(Agra<7>_1:112.172,(Dpon<5>_0:101.931,(Cass<3>_0:92.1677,(Smad<1>*_6:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
13 (Tcas<11>_1:236.2,(Sory<9>_0:133.223,(Agra<7>_0:112.172,(Dpon<5>_1:101.931,(Cass<3>_0:92.1677,(Smad<1>*_6:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
14 (Tcas<11>_1:236.2,(Sory<9>_0:133.223,(Agra<7>_0:112.172,(Dpon<5>_0:101.931,(Cass<3>_2:92.1677,(Smad<1>*_4:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
15 (Tcas<11>_1:236.2,(Sory<9>_0:133.223,(Agra<7>_0:112.172,(Dpon<5>_0:101.931,(Cass<3>_0:92.1677,(Smad<1>*_4:78.9022,Ojiro<0>_0:78.9022)<2>_1:13.2655)<4>_1:9.76356)<6>_1:10.2402)<8>_1:21.0518)<10>_1:102.977)<12>_1;
Smad Ojiro Smad_Ojiro Cass Ojiro_Cass Dpon Cass_Dpon Agra Curculionidae
1 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
2 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
3 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
4 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
5 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
6 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
7 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
8 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
9 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
10 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
11 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
12 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
13 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
14 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
15 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
Sory Curculionoidea Tcas all
1 FALSE FALSE FALSE FALSE
2 FALSE FALSE FALSE FALSE
3 FALSE FALSE FALSE FALSE
4 FALSE FALSE FALSE FALSE
5 FALSE FALSE FALSE FALSE
6 FALSE FALSE FALSE FALSE
7 FALSE FALSE FALSE FALSE
8 FALSE FALSE FALSE FALSE
9 FALSE FALSE FALSE FALSE
10 FALSE FALSE FALSE FALSE
11 FALSE FALSE FALSE FALSE
12 FALSE FALSE FALSE FALSE
13 FALSE FALSE FALSE FALSE
14 FALSE FALSE FALSE FALSE
15 FALSE FALSE FALSE FALSE
15個の遺伝子ファミリーがマダラで特異的に増加した
これに機能アノテーションをつける
# Orthogroups.tsvの読み込み
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
read.delim("/Users/kosukesano/bio/for_cafe/241019_orthofinder_data/Orthogroups.tsv", header=FALSE, sep="\t",
#stringsAsFactors=FALSE,
#col.names = "Data"
skip=1
)|>
dplyr::select("V1", "V6")
Smad_df2=dplyr::left_join(Smad_df, orthogroups, by = c(FamilyID = "V1"))|>#マダラで有意に増加したOGのOG番号とマダラの遺伝子IDを紐付ける
dplyr::select(!c(Smad.1., tree))
Smad_df3 <- Smad_df2 %>%###マダラでのみ増加した遺伝子のgene_IDとOG番号
separate_rows(V6, sep = ", ") %>%
rename(gene_ID = V6, family_ID = FamilyID)|>
dplyr::mutate(gene_ID = stringr::str_replace(gene_ID, "^Smad_", ""))
fa<-read.csv("/Users/kosukesano/bio/functional_annotation/merged_with_gene_function.csv", sep=",")
Smad_df4=dplyr::left_join(Smad_df3, fa, by = c(gene_ID = "Madara"))|>###完成系
print()# A tibble: 114 × 23
family_ID Smad Ojiro Smad_Ojiro Cass Ojiro_Cass Dpon Cass_Dpon Agra
<chr> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl> <lgl>
1 OG0000479 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
2 OG0000479 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
3 OG0000479 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
4 OG0000479 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
5 OG0000479 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
6 OG0000479 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
7 OG0000479 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
8 OG0000479 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
9 OG0000479 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
10 OG0000479 TRUE FALSE FALSE FALSE FALSE FALSE FALSE FALSE
# ℹ 104 more rows
# ℹ 14 more variables: Curculionidae <lgl>, Sory <lgl>, Curculionoidea <lgl>,
# Tcas <lgl>, all <lgl>, gene_ID <chr>, Ecoli <chr>, Ecol_GeneFunction <chr>,
# Dmelanogaster <chr>, Dmel_GeneFunction <chr>, Tcastaneum <chr>,
# Tcas_GeneFunction <chr>, Soryzae <chr>, Sory_GeneFunction <chr>
114個の遺伝子が抽出できた
2024年11月
ETEの導入
~/tools/for_ETE?ディレクトリを作成した。
本家サイト?によるとcondaで入れるといいと書いてあったが、ete3とete_toolchainのインストール時にパッケージの依存関係の競合が発生した
(ete3) kosukesano@at139:~/tools/for_ETE$ ete3 build check
Command 'ete3' not found, did you mean:
command 'etex' from deb texlive-binaries (2021.20210626.59705-1ubuntu0.1)
Try: apt install <deb name>
(ete3) kosukesano@at139:~/tools/for_ETE$ Pinned packages:
- python 3.8.*
Could not solve for environment specs
Encountered problems while solving:
- package ete_toolchain-3.0.0-h73706c9_0 requires pmodeltest 1.4.*, but none of the providers can be installed
The environment can't be solved, aborting the operation結局GitHubに書いてあった方法を使った。
(base) kosukesano@at138:~/tools/for_ETE$ pip install https://github.com/etetoolkit/ete/archive/ete4.zip
Collecting https://github.com/etetoolkit/ete/archive/ete4.zip
Downloading https://github.com/etetoolkit/ete/archive/ete4.zip
/ 4.3 MB 15.1 MB/s 0:00:00
Installing build dependencies ... done
Getting requirements to build wheel ... done
Preparing metadata (pyproject.toml) ... done
Collecting bottle (from ete4==4.0.0b2)
Downloading bottle-0.13.2-py2.py3-none-any.whl.metadata (1.8 kB)
Collecting brotli (from ete4==4.0.0b2)
Downloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (5.5 kB)
Collecting numpy (from ete4==4.0.0b2)
Downloading numpy-2.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.9/60.9 kB 2.0 MB/s eta 0:00:00
Collecting scipy (from ete4==4.0.0b2)
Downloading scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (60 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 60.8/60.8 kB 3.3 MB/s eta 0:00:00
Requirement already satisfied: requests in /lustre7/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages (from ete4==4.0.0b2) (2.28.1)
Requirement already satisfied: charset-normalizer<3,>=2 in /lustre7/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages (from requests->ete4==4.0.0b2) (2.1.1)
Requirement already satisfied: idna<4,>=2.5 in /lustre7/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages (from requests->ete4==4.0.0b2) (3.4)
Requirement already satisfied: urllib3<1.27,>=1.21.1 in /lustre7/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages (from requests->ete4==4.0.0b2) (1.26.13)
Requirement already satisfied: certifi>=2017.4.17 in /lustre7/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/lib/python3.10/site-packages (from requests->ete4==4.0.0b2) (2024.2.2)
Downloading bottle-0.13.2-py2.py3-none-any.whl (104 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 104.1/104.1 kB 5.9 MB/s eta 0:00:00
Downloading Brotli-1.1.0-cp310-cp310-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.0 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.0/3.0 MB 61.7 MB/s eta 0:00:00
Downloading numpy-2.1.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 16.3/16.3 MB 83.5 MB/s eta 0:00:00
Downloading scipy-1.14.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (41.2 MB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 41.2/41.2 MB 53.6 MB/s eta 0:00:00
Building wheels for collected packages: ete4
Building wheel for ete4 (pyproject.toml) ... done
Created wheel for ete4: filename=ete4-4.0.0b2-cp310-cp310-linux_x86_64.whl size=3142932 sha256=b89f676934222cd824353b76450b0b96d9e243646c2da6aea7d17b2dd479bcc6
Stored in directory: /tmp/pip-ephem-wheel-cache-ksa3wdi5/wheels/89/21/61/80025b2b6138108e4f3ee405a77c230502321e3e0a470f8492
Successfully built ete4
Installing collected packages: brotli, bottle, numpy, scipy, ete4
Successfully installed bottle-0.13.2 brotli-1.1.0 ete4-4.0.0b2 numpy-2.1.2 scipy-1.14.1
(base) kosukesano@at138:~/tools/for_ETE$ ただこれ間違えてmambaのbase環境でやっちゃったので、改めてete4の環境を作ってそちらでインストールした。
(base) kosukesano@at138:~/tools/for_ETE$ conda create -n ete4
Collecting package metadata (current_repodata.json): done
Solving environment: done
==> WARNING: A newer version of conda exists. <==
current version: 22.9.0
latest version: 24.9.2
Please update conda by running
$ conda update -n base -c conda-forge conda
## Package Plan ##
environment location: /lustre7/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/envs/ete4
Proceed ([y]/n)? y
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
#
# To activate this environment, use
#
# $ conda activate ete4
#
# To deactivate an active environment, use
#
# $ conda deactivate
Retrieving notices: ...working... done
(base) kosukesano@at138:~/tools/for_ETE$ conda activate ete4
(ete4) kosukesano@at138:~/tools/for_ETE$ python -c "import ete4; print(ete4.__version__)"
Traceback (most recent call last):
File "<string>", line 1, in <module>
ModuleNotFoundError: No module named 'ete4'
(ete4) kosukesano@at138:~/tools/for_ETE$ pip install https://github.com/etetoolkit/ete/archive/ete4.zip
Defaulting to user installation because normal site-packages is not writeable
Collecting https://github.com/etetoolkit/ete/archive/ete4.zip
Downloading https://github.com/etetoolkit/ete/archive/ete4.zip
- 4.3 MB 13.6 MB/s 0:00:00
Installing build dependencies ... done
Getting requirements to build wheel ... done
Installing backend dependencies ... done
Preparing metadata (pyproject.toml) ... done
Building wheels for collected packages: ete4
Building wheel for ete4 (pyproject.toml) ... done
Created wheel for ete4: filename=ete4-0.0.0-cp310-cp310-linux_x86_64.whl size=3708108 sha256=62c7f65349abb0abd88fe4df93dfcd7c4c36fb3ae69ae7588a2863f40f725ba0
Stored in directory: /tmp/pip-ephem-wheel-cache-lfo6wso4/wheels/89/21/61/80025b2b6138108e4f3ee405a77c230502321e3e0a470f8492
Successfully built ete4
Installing collected packages: ete4
Successfully installed ete4-0.0.0実際に動かしてみる
(ete4) kosukesano@at138:~/tools/for_ETE$ python -c "import ete4; print(ete4.__version__)"
4.0.0-beta
(ete4) kosukesano@at138:~/tools/for_ETE$動いてるっぽい。
環境もセットで作ったのでETE環境立ち上げ用のプロファイルを~/tools/pyenv_envに作成する。
### ETE_profileの中身
source ~/.bash_profile
source ~/pyenv_conda_environment/.pyenv_profile
pyenv global mambaforge-22.9.0-3
# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
eval "$__conda_setup"
else
if [ -f "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3//etc/profile.d/conda.sh" ]; then
. "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/conda.sh"
else
export PATH="/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/bin:$PATH"
fi
fi
unset __conda_setup
if [ -f "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh" ]; then
. "/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh"
fi
# <<< conda initialize <<<
conda activate ete4
python -c "import ete4; print(ete4.__version__)"実際に実行するとこんな感じ
kosukesano@at138:~/tools/pyenv_env$ source ETE_profile
4.0.0-beta
(ete4) kosukesano@at138:~/tools/pyenv_env$ 環境に入ると同時に4.0.0-betaって言ってくれる
試しにprint.pyを作成して実行してみる。ちなみにOG0008871.nwkは0930のASTRALの入力に使ったファイルから持ってきた。
### print.pyの中身
import ete4
print(ete4.__version__)
from ete4 import Tree
t = Tree(open('OG0008871.nwk'))
t.explore()(ete4) kosukesano@at138:~/tools/for_ETE/test_241101$ python print.py
4.0.0-beta
Traceback (most recent call last):
File "/lustre7/home/kosukesano/tools/for_ETE/test_241101/print.py", line 9, in <module>
t.explore()
File "ete4/core/tree.pyx", line 1095, in ete4.core.tree.Tree.explore
File "/home/kosukesano/.local/lib/python3.10/site-packages/ete4/smartview/gui/server.py", line 36, in <module>
from bottle import (
ModuleNotFoundError: No module named 'bottle'
(ete4) kosukesano@at138:~/tools/for_ETE/test_241101$ こういうエラー。bottleをインストールしてもう一回
(ete4) kosukesano@at138:~/tools/for_ETE/test_241101$ pip install bottle
Defaulting to user installation because normal site-packages is not writeable
Collecting bottle
Downloading bottle-0.13.2-py2.py3-none-any.whl (104 kB)
━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 104.1/104.1 KB 2.6 MB/s eta 0:00:00
Installing collected packages: bottle
Successfully installed bottle-0.13.2
(ete4) kosukesano@at138:~/tools/for_ETE/test_241101$ python print.py
4.0.0-beta
https://github.com/etetoolkit/ete-data/raw/main/layouts/pfam2color.json -> /home/kosukesano/.local/share/ete/pfam2color.json
https://github.com/etetoolkit/ete-data/raw/main/layouts/smart2color.json -> /home/kosukesano/.local/share/ete/smart2color.json
Added tree tree-1 with id 0.
(ete4) kosukesano@at138:~/tools/for_ETE/test_241101$ lsなんか挙動が変わった。
1104
DfroのBRAKER結果
kosukesano@at139:~/tools/for_braker/Dfro$ ls
Dfro_braker.sh Dfro_braker.sh.e27165165 Dfro_braker.sh.o27165165 Dfro_braker.sh.pe27165165 Dfro_braker.sh.po27165165 braker gpu
kosukesano@at139:~/tools/for_braker/Dfro$ ls braker/
Augustus GeneMark-EP GeneMark-ES braker.aa braker.codingseq braker.gtf braker.log errors genome_header.map hintsfile.gff prothint.gff species what-to-cite.txt
kosukesano@at139:~/tools/for_braker/Dfro$ cd braker/
kosukesano@at139:~/tools/for_braker/Dfro/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file format type num_seqs sum_len min_len avg_len max_len
braker.aa FASTA Protein 19,978 8,071,872 2 404 18,409
kosukesano@at139:~/tools/for_braker/Dfro/braker$ ちゃんとできた。
Orthofinderのためにヘッダーを書き換えておく。
kosukesano@at138:~/tools/for_braker/Dfro/braker$ ls
Augustus GeneMark-EP GeneMark-ES braker.aa braker.codingseq braker.gtf braker.log errors genome_header.map hintsfile.gff prothint.gff species what-to-cite.txt
kosukesano@at138:~/tools/for_braker/Dfro/braker$ cp braker.aa ../Dfro.fasta
kosukesano@at138:~/tools/for_braker/Dfro/braker$ cd ../
kosukesano@at138:~/tools/for_braker/Dfro$ ls
Dfro.fasta Dfro_braker.sh Dfro_braker.sh.e27165165 Dfro_braker.sh.o27165165 Dfro_braker.sh.pe27165165 Dfro_braker.sh.po27165165 braker gpu
kosukesano@at138:~/tools/for_braker/Dfro$ edit.py
edit.py: command not found
kosukesano@at138:~/tools/for_braker/Dfro$ nano edit.py
kosukesano@at138:~/tools/for_braker/Dfro$ python edit.py
../Dfro/RemakeHedder_Dfro/Dfro.fasta に保存しました。
kosukesano@at138:~/tools/for_braker/Dfro$ ls
Dfro.fasta Dfro_braker.sh Dfro_braker.sh.e27165165 Dfro_braker.sh.o27165165 Dfro_braker.sh.pe27165165 Dfro_braker.sh.po27165165 RemakeHedder_Dfro braker edit.py gpu
kosukesano@at138:~/tools/for_braker/Dfro$ ls RemakeHedder_Dfro/
Dfro.fasta
kosukesano@at138:~/tools/for_braker/Dfro$ DvalのBRAKER結果
kosukesano@at139:~/tools/for_braker/Dval$ ls
Dval_braker.sh Dval_braker.sh.e27165208 Dval_braker.sh.o27165208 Dval_braker.sh.pe27165208 Dval_braker.sh.po27165208 braker gpu
kosukesano@at139:~/tools/for_braker/Dval$ ls braker/
Augustus GeneMark-EP GeneMark-ES braker.aa braker.codingseq braker.gtf braker.log errors genome_header.map hintsfile.gff prothint.gff species what-to-cite.txt
kosukesano@at139:~/tools/for_braker/Dval$ cd braker/
kosukesano@at139:~/tools/for_braker/Dval/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file format type num_seqs sum_len min_len avg_len max_len
braker.aa FASTA Protein 37,002 12,331,785 2 333.3 21,873
kosukesano@at139:~/tools/for_braker/Dval/braker$こっちもできてそう。これもedit.pyを使ってヘッダー書き換えた。
PstrのBRAKERやり直し
1000bp未満を切ったやつは終わってた。他はまだ。
kosukesano@at139:~/tools/for_braker/Pstr/upper_1_k/braker$ ls
GeneMark-EP braker.gtf_temp gc_content.out nuc.fasta train.gb
GeneMark-EP.stdout braker.log gene_stat.yaml optimize_augustus.stdout train.gb.test
GeneMark-ES cmd.log genemark_evidence.gff prevHints.gff train.gb.train
GeneMark-ES.stdout diamond genemark_hintsfile.gff proteins.fa train.gb.train.test
Spaln downsample_traingenes.log genome.fa prothint.gff train.gb.train.train
aa2nonred.stdout ensure_min_n_training_genes.stdout genome.fa.cidx prothint_augustus.gff traingenes.good.fa
aug_hints.lst errors genome_header.map protl4pnvjoe traingenes.good.gtf
augustus.hints.tmp.gtf etrain.bad.lst genome_split secondetraining.stdout traingenes.good.nr.fa
augustus.hints_iter1.aa evidence.gff getAnnoFastaFromJoingenes.augustus.hints_hints.stdout secondtest.stdout traingenes.gtf
augustus.hints_iter1.codingseq filterGenemark.stdout getAnnoFastaFromJoingenes.augustus.hints_tmp.stdout seed_proteins.faa uniqueSeeds.gtf
augustus.hints_iter1.gff firstetraining.stdout getAnnoFastaFromJoingenes.braker_.stdout species what-to-cite.txt
augustus.hints_iter1.gtf firsttest.stdout good_genes.lst tmp_opt_Pstr_1k
braker.aa fix_IFS_log_hoefclri hints.job.lst top_chains.gff
braker.codingseq fix_in_frame_stop_codon_genes_augustus.hints.log hintsfile.gff train.f.gb
braker.gtf gbFilterEtraining.stdout nonred.loci.lst train.ff.gb
kosukesano@at139:~/tools/for_braker/Pstr/upper_1_k/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file format type num_seqs sum_len min_len avg_len max_len
braker.aa FASTA Protein 221,240 51,377,361 2 232.2 11,401
kosukesano@at139:~/tools/for_braker/Pstr/upper_1_k/braker$ なんか出力ファイル多くね?でもできてそうではある。
kosukesano@at139:~/tools/for_braker/Pstr/upper_1_k/braker$ cp braker.aa ../Pstr_upper1k.fasta
kosukesano@at139:~/tools/for_braker/Pstr/upper_1_k/braker$
kosukesano@at139:~/tools/for_braker/Pstr/upper_1_k$ python edit.py
../upper_1_k/RemakeHedder_Pstr/Pstr_upper1k.fasta に保存しました。
kosukesano@at139:~/tools/for_braker/Pstr/upper_1_k$ ls
Pstr_1k_braker.sh Pstr_1k_braker.sh.e27173011 Pstr_1k_braker.sh.o27173011 Pstr_1k_braker.sh.pe27173011 Pstr_1k_braker.sh.po27173011 Pstr_upper1k.fasta RemakeHedder_Pstr braker edit.py
kosukesano@at139:~/tools/for_braker/Pstr/upper_1_k$ less RemakeHedder_Pstr/Pstr_upper1k.fasta
kosukesano@at139:~/tools/for_braker/Pstr/upper_1_k$edit.pyを使ってヘッダー書き換え。
合計10種でのOrthofinder
~/tools/for_orthofinder/241104_10spを作成、そこに10種分の.fastaファイルをコピーした。
kosukesano@at138:~/tools/for_orthofinder$ mkdir 241104_10sp
kosukesano@at138:~/tools/for_orthofinder$ ls
241019_6plusOjiro RemakeHedder_6sp Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.o26310331
241104_10sp Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.pe26291666
CO1_6sp Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.pe26310331
Orthofinder_240917_RH.sh Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.po26291666
Orthofinder_240917_RH.sh.e26802366 Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.po26310331
Orthofinder_240917_RH.sh.o26802366 Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh.e26224546 make_philo_tree
Orthofinder_240917_RH.sh.pe26802366 Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh.o26224546 seven_sp.sh
Orthofinder_240917_RH.sh.po26802366 Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh.pe26224546 seven_sp.sh.e26639936
Orthofinder_241019.sh Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh.po26224546 seven_sp.sh.o26639936
Orthofinder_241019.sh.e27076911 Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh seven_sp.sh.pe26639936
Orthofinder_241019.sh.o27076911 Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.e26291666 seven_sp.sh.po26639936
Orthofinder_241019.sh.pe27076911 Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.e26310331
Orthofinder_241019.sh.po27076911 Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.o26291666
kosukesano@at138:~/tools/for_orthofinder$ cd 241104_10sp/
kosukesano@at138:~/tools/for_orthofinder/241104_10sp$ cp ~/tools/for_braker/Dfro/RemakeHedder_Dfro/Dfro.fasta ../241104_10sp/
kosukesano@at138:~/tools/for_orthofinder/241104_10sp$ cp ~/tools/for_braker/Dval/RemakeHedder_Dval/Dval.fasta ../241104_10sp/
kosukesano@at138:~/tools/for_orthofinder/241104_10sp$ cp ~/tools/for_braker/Pstr/upper_1_k/RemakeHedder_Pstr/Pstr_upper1k.fasta ../241104_10sp/Pstr.fasta
kosukesano@at138:~/tools/for_orthofinder/241104_10sp$ ls
Dfro.fasta Dval.fasta Pstr.fasta
kosukesano@at138:~/tools/for_orthofinder/241104_10sp$
kosukesano@at138:~/tools/for_orthofinder/241104_10sp$ cp ../241019_6plusOjiro/*.fasta ../241104_10sp/
kosukesano@at138:~/tools/for_orthofinder/241104_10sp$ ls
Agra.fasta Cass.fasta Dfro.fasta Dpon.fasta Dval.fasta Ojiro.fasta Pstr.fasta Smad.fasta Sory.fasta Tcas.fasta
kosukesano@at138:~/tools/for_orthofinder/241104_10sp$ ここでOrthofinder_241104.shを作成、qsubで投げた。
### Orthofinder_241104.sh
#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 16
#$ -l intel
echo start at
date
singularity exec /usr/local/biotools/o/orthofinder:2.5.4--hdfd78af_0 orthofinder\
-f /home/kosukesano/tools/for_orthofinder/241104_10sp\
-t 16
dateマダラを抜いてオジロを入れた6種でのOrthofinder
~/tools/for_orthofinder/241104_5plusOjiroを作成、そこに以下の方法で6種のゲノムをコピーした。
kosukesano@at138:~/tools/for_orthofinder/241104_5plusOjiro$ cp ../241019_6plusOjiro/*fasta ../241104_5plusOjiro/
kosukesano@at138:~/tools/for_orthofinder/241104_5plusOjiro$ ls
Agra.fasta Cass.fasta Dpon.fasta Ojiro.fasta Smad.fasta Sory.fasta Tcas.fasta
kosukesano@at138:~/tools/for_orthofinder/241104_5plusOjiro$ rm Smad.fasta
kosukesano@at138:~/tools/for_orthofinder/241104_5plusOjiro$ ls
Agra.fasta Cass.fasta Dpon.fasta Ojiro.fasta Sory.fasta Tcas.fasta
kosukesano@at138:~/tools/for_orthofinder/241104_5plusOjiro$ ここでOrthofinder_241104_5plusOjiro.shを作成し、qsubで実行した。
こっちの方が早く終わった。
マダラを抜いてオジロを入れた6種でのIQTREE
~/tools/for_IQTREE/241104_5sp_plusOjiroを作成、その下でIQTREE_1.pyを実行した。
### IQTREE_1.py
##analysis_manual.pptxの#46も参照
##AFTER you made MSA file(all_seq.fa) in DDBJ with makeMSA.sh
##時間は10secほど
import numpy as np
import pandas as pd
import os
path = "/home/kosukesano/tools/for_orthofinder/241104_5plusOjiro/OrthoFinder/Results_Nov04/"
withpath = "../../for_orthofinder/241104_5plusOjiro/OrthoFinder/Results_Nov04/"
OGs = pd.read_table(path + "Orthogroups/Orthogroups.tsv")
# ManualPhylo_dataディレクトリが存在しない場合は作成
os.makedirs(path + "ManualPhylo_data", exist_ok=True)
##with openは相対パスしか受け付けないらしい
new = pd.DataFrame()
with open(withpath + "Orthogroups/Orthogroups_SingleCopyOrthologues.txt", "r") as fin:
for line in fin:
li = line.rstrip()
new = pd.concat([new, OGs[OGs["Orthogroup"] == li]])
print(new)
new.to_csv(path + "ManualPhylo_data/OG_list.txt", sep = " ", index = False, header = False)
##OG_list.txtと同じ順番の種名リストであるspecies_list.txtを作成
##できたOG_list.txtに、DDBJで作ったall_seq.faで配列情報を与える。
li = []
allspe = OGs.columns.tolist()
allspe2 = allspe[1:len(allspe)]
with open(withpath + "ManualPhylo_data/species_list.txt", "w") as file:
for column_name in allspe2:
file.write("%s\n" % column_name)実行するとこんな感じ。
kosukesano@at138:~/tools/for_IQTREE/241104_5sp_plusOjiro$ python IQTREE_1.py
Orthogroup Agra Cass Dpon Ojiro Sory Tcas
8088 OG0008088 Agra_P_050292700.1 Cass_AG9761214.1 Dpon_P_019755574.2 Ojir_g1996.t1 Sory_P_030761209.1 Tcas_P_008195282.1
8089 OG0008089 Agra_P_050292731.1 Cass_AH1135743.1 Dpon_P_048519923.1 Ojir_g7978.t1 Sory_P_030765758.1 Tcas_P_008196870.1
8090 OG0008090 Agra_P_050292732.1 Cass_AG9767756.1 Dpon_P_019773495.1 Ojir_g6189.t1 Sory_P_030765067.1 Tcas_P_015836383.1
8091 OG0008091 Agra_P_050292739.1 Cass_AG9768060.1 Dpon_P_019769194.2 Ojir_g6137.t1 Sory_P_030755089.1 Tcas_P_969265.1
8092 OG0008092 Agra_P_050292743.1 Cass_AG9767942.1 Dpon_P_019767966.1 Ojir_g4737.t1 Sory_P_030750408.1 Tcas_P_971491.1
... ... ... ... ... ... ... ...
10018 OG0010018 Agra_P_050316302.1 Cass_AG9767812.1 Dpon_P_019773553.2 Ojir_g6202.t1 Sory_P_030765225.1 Tcas_P_008194975.1
10021 OG0010021 Agra_P_050316346.1 Cass_AG9766968.1 Dpon_P_019768198.1 Ojir_g7446.t1 Sory_P_030747218.1 Tcas_P_015834541.1
10023 OG0010023 Agra_P_050316372.1 Cass_AG9765979.1 Dpon_P_019758814.2 Ojir_g9692.t1 Sory_P_030763414.1 Tcas_P_971352.1
10026 OG0010026 Agra_P_050316407.1 Cass_AG9762302.1 Dpon_P_019758828.1 Ojir_g9703.t1 Sory_P_030763403.1 Tcas_P_008196032.1
10027 OG0010027 Agra_P_050316412.1 Cass_AG9765463.1 Dpon_P_019755650.2 Ojir_g11721.t1 Sory_P_030763780.1 Tcas_P_008199734.2
[1480 rows x 7 columns]
kosukesano@at138:~/tools/for_IQTREE/241104_5sp_plusOjiro$ 続いてconcatinate.shを作成、実行した。
### concatinate.shの中身
#$ -S /bin/bash
#$ -cwd
echo start at
date
# Enter the directory containing the fasta files
filesout="/home/kosukesano/tools/for_orthofinder/241104_5plusOjiro" ## Please replace with the actual directory containing the fasta files
# Define the output directory and output file
new="/home/kosukesano/tools/for_IQTREE/241104_5sp_plusOjiro"
mkdir -p $new
# Concatenate all fasta files into one file
for file in "$filesout"/*.fasta; do
cat "$file" >> "${new}/all_seq.fa"
done
dateこれを実行するとこんな感じ
kosukesano@at138:~/tools/for_IQTREE/241104_5sp_plusOjiro$ nano concatinate.sh
kosukesano@at138:~/tools/for_IQTREE/241104_5sp_plusOjiro$ sh concatinate.sh
start at
Mon Nov 4 18:48:00 JST 2024
Mon Nov 4 18:48:01 JST 2024
kosukesano@at138:~/tools/for_IQTREE/241104_5sp_plusOjiro$ ls
IQTREE_1.py all_seq.fa concatinate.sh
kosukesano@at138:~/tools/for_IQTREE/241104_5sp_plusOjiro$ 次にIQTREE_2.pyを作り、実行。
### IQTREE_2.pyの中身
import sys
from Bio import SeqIO
path = "../../for_orthofinder/241104_5plusOjiro/OrthoFinder/Results_Nov04/ManualPhylo_data/"
fasta_in = sys.argv[1] #1番目の引数には上記のall_seq.faなどfastaファイルを指定する
query_in = sys.argv[2] #2番目の引数には上記のOG_list.txtなどオーソログファイルを指定する
for q in open(query_in, "r"): #オーソログファイルを開いて1行づつ読み込む
query = q.split() #スペース毎に切りとってリスト形式でqueryに保存する
f = open(path + query[0], 'w') #最初の列(OG名)と同じ名前のファイルを作成する
for record in SeqIO.parse(fasta_in, 'fasta'): #fastaファイルを開くSeqIOを使ってパースする(1項目づつ読み込む)
id_part = record.id #fastaのID部分を読み込む
desc_part = record.description #fastaのdescription部分を読み込む
seq = record.seq #fastaの配列部分を読み込む
for i in range(len(query)): #オーソログファイル中の各OGに含まれる配列数を数えて、その分繰り返す(python2の人はrange を x rangeにする)
if desc_part == query[i] : #オーソログファイルの配列descriptionとfastaの配列descriptionが一致したら、、、
fasta_seq = '>' + desc_part + '\n' + seq + '\n' #fasta形式に整え
print(fasta_seq) #標準出力にfastaを出力(進行状況把握用)
f.write(str(fasta_seq)) #各OGファイルにfastaを出力
f.close()実行のコマンドは以下の通り
python IQTREE_2.py all_seq.fa ../../for_orthofinder/241104_5plusOjiro/OrthoFinder/Results_Nov04/ManualPhylo_data/OG_list.txt 結構時間かかる。
続いて~/tools/for_orthofinder/241104_5plusOjiro/OrthoFinder/Results_Nov04/ManualPhylo_dataに移動、そこでMPT環境に入る。
source ~/tools/pyenv_env/ManualPhilo_profile そこでalign.shをコピーして作成、実行した。
### align.shの中身
#!/bin/sh
#$ -S /bin/bash
#$ -cwd
#$ -v PATH
awk '{print($1)}' $1 | while read x; do #引数に前述のOG_list.txtなどのOGリストを指定する。
mafft --auto $x > $x.maffted.fa
trimal -in $x.maffted.fa -out $x.maffted.trimed.fa -keepheader -htmlout $x.maffted.trimed.fa.html -automated1
done実行のコマンドは以下の通り。
sh align.sh OG_list.txtこれもそこそこ時間がかかる
続いて、~/tools/for_IQTREE/241104_5sp_plusOjiroに移動し、IQTREE_3.pyを作成
### IQTREE_3.pyの中身
import os
# ファイルのヘッダーを変更する関数
def modify_headers(input_file, output_file):
with open(input_file, 'r') as infile, open(output_file, 'w') as outfile:
for line in infile:
if line.startswith(">"):
# ヘッダー行の最初の四文字を抽出して書き換え
outfile.write(f">{line[1:5]}\n")
else:
outfile.write(line)
# 作業ディレクトリ内のすべての ".maffted.trimed.fa" ファイルに対して処理を適用し、出力を別ディレクトリに保存
def process_directory(input_directory, output_directory):
if not os.path.exists(output_directory):
os.makedirs(output_directory)
for filename in os.listdir(input_directory):
if filename.endswith(".maffted.trimed.fa"):
input_file = os.path.join(input_directory, filename)
output_file = os.path.join(output_directory, filename.replace(".maffted.trimed.fa", ".maffted.trimed.edit.fa"))
modify_headers(input_file, output_file)
print(f"Processed: {filename}")
# 実行するディレクトリを指定
input_directory = "/home/kosukesano/tools/for_orthofinder/241104_5plusOjiro/OrthoFinder/Results_Nov04/ManualPhylo_data"
output_directory = "/home/kosukesano/tools/for_IQTREE/241104_5sp_plusOjiro"
process_directory(input_directory, output_directory)これを実行すると、*.maftted.trimed.edit.faファイルができる。
ここでmakerun.pyを作成。
### makerun.pyの中身
import glob
import os
list = []
for i in glob.glob('*.maffted.trimed.edit.fa'):
list.append(os.path.split(i)[1].rstrip())
#print(list[0])
##ls | grep "maffted.trimed.edit.fa" > otamesi.txtで、完成したOGをotamesi.txtに一行ずつ保存
##ファイルの行数をカウント。このカウント数がfor文のrangeに入る数になる
f = open("run.nex", "w")
f.write("#nexus" + "\n")
f.write("begin sets;" + "\n")
character = "charset part"
for line, i in zip(list, range(4997)):
row = character + str(i+1) + " = " + line + ": ;"
f.write("\t" + row + "\n")
f.write("end;" + "\n")
f.close()これでrun.nexファイルができる。これを使ってmanualphylo.shを実行。
### manualphylo.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
date
singularity exec -e /usr/local/biotools/i/iqtree:2.3.3--h21ec9f0_0 iqtree2 -sp run.nex -nt AUTO -bb 1000 -cptime 600
dateこれをqsubで投げた。
マダラを抜いてオジロを入れた6種でのASTRAL前準備
IQTREEと同じディレクトリで、*.maftted.trimed.edit.faファイルが揃った後に行う。
makealltree.shを作成し、実行。
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
echo start at
date
# Singularityイメージのパスを指定
SINGULARITY_IMAGE="/usr/local/biotools/i/iqtree:2.3.3--h21ec9f0_0"
# 作業ディレクトリに移動
cd /home/kosukesano/tools/for_IQTREE/241104_5sp_plusOjiro
# 出力ファイル
output_file="all_trees.nwk"
# 既存の出力ファイルを削除
if [ -f $output_file ]; then
rm $output_file
fi
# *.maffted.trimed.edit.fa ファイルを処理
for file in *.maffted.trimed.edit.fa; do
# ファイル名から拡張子を除いたベース名を取得
base_name=$(basename $file .maffted.trimed.edit.fa)
# Singularityを使用してIQ-TREEを実行して系統樹を作成
singularity exec -e $SINGULARITY_IMAGE iqtree2 -s $file -nt AUTO -bb 1000 -cptime 600 -pre ${base_name}
# 作成された系統樹ファイル (.treefile) を output_file に追加
if [ -f ${base_name}.treefile ]; then
echo -n "${base_name}: " >> $output_file
cat ${base_name}.treefile >> $output_file
echo "" >> $output_file
else
echo "Error: ${base_name}.treefile not found" >&2
fi
done
echo "All trees have been written to $output_file"
dateこれをqsubで投げた。
ETEの続き
ETE4のファイル読み込みなどについてわかったことのまとめ
### print.pyの中身
import ete3
print(ete4.__version__)
print(ete3.__version__)
from ete4 import Tree
t = Tree(open('OG0008871.nwk')) #ツリー読み込み
t.explore()
print(t) #ツリーのプリント
# ルートノードに子ノードを追加
t.add_child(name="child1")
t.add_child(name="child2")
print(t)
### エラーが起きたコード###
# child1ノードを削除
#child1 = t & "child1" # "child1"のノードにアクセス
#child1.detach() # ノードをツリーから削除
###
# "child1"という名前のノードにアクセスする
#child1 = t.search_nodes(name="child1")[0] # 検索結果はリストで返されるので、[0]で最初のノードを取得
#child1.detach() # ノードをツリーから削除
###
# "child1"という名前のノードにアクセスする
child1 = list(t.search_nodes(name="child1"))[0] # リストに変換してから最初のノードを取得
child1.detach() # ノードをツリーから削除
print(t)
# "child1"という名前のノードにアクセスする
child2 = list(t.search_nodes(name="child2"))[0] # リストに変換してから最初のノードを取得
# child2ノードの名前を変更
child2.name = "new_child2"
print(t)
# "Smad"という名前のノードにアクセスする
Smad = list(t.search_nodes(name="Smad"))[0] # リストに変換してから最初のノードを取得
# Smadノードの名前を変更
Smad.name = "Smad#"
print(t)
# "child1"という名前のノードにアクセスする
new_child2 = list(t.search_nodes(name="new_child2"))[0] # リストに変換してから最初のノードを取得
new_child2.detach()
# ファイルに書き込み
t . write ( parser = 1 , outfile = "new_tree.nwk" )これを実行するとこんな感じ
(ete4) kosukesano@at138:~/tools/for_ETE/test_241101$ python print.py
4.0.0-beta
3.1.3
Added tree tree-1 with id 0.
╭╴Agra
│ ╭─┬╴Cass
─┤ ╭─┤ ╰╴Smad
├─┤ ╰╴Dpon
│ ╰╴Sory
╰╴Tcas
╭╴Agra
│ ╭─┬╴Cass
│ ╭─┤ ╰╴Smad
─┼─┤ ╰╴Dpon
│ ╰╴Sory
├╴Tcas
├╴child1
╰╴child2
╭╴Agra
│ ╭─┬╴Cass
│ ╭─┤ ╰╴Smad
─┼─┤ ╰╴Dpon
│ ╰╴Sory
├╴Tcas
╰╴child2
╭╴Agra
│ ╭─┬╴Cass
│ ╭─┤ ╰╴Smad
─┼─┤ ╰╴Dpon
│ ╰╴Sory
├╴Tcas
╰╴new_child2
╭╴Agra
│ ╭─┬╴Cass
│ ╭─┤ ╰╴Smad#
─┼─┤ ╰╴Dpon
│ ╰╴Sory
├╴Tcas
╰╴new_child2
(ete4) kosukesano@at138:~/tools/for_ETE/test_241101$また、これでできたnew_tree.nwkはこんな感じ
(Agra:0.173734,(((Cass:0.204166,Smad#:0.219138):0.0351109,Dpon:0.464555):0.0267139,Sory:0.350028):0.0524272,Tcas:0.548018);葉に#をつけるだけならこれでいいか?
1105
マダラを抜いてオジロを入れた6種でのIQTREE結果
~/tools/for_IQTREE/241104_5sp_plusOjiro以下にrun.nex.treefileができていた
### run.nex.treefileの中身
(Agra:0.2094936641,(Cass:0.1912631030,(Dpon:0.2342863798,Ojir:0.1338157492)86:0.0164703012)100:0.0204609199,(Sory:0.2171214042,Tcas:0.5310501590)100:0.0698739346);合計10種でのOrthofinder結果
kosukesano@at139:~/tools/for_orthofinder$ ls
241019_6plusOjiro Orthofinder_241019.sh.po27076911 Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.o26291666
241104_10sp RemakeHedder_6sp Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.o26310331
241104_5plusOjiro Smad_Agra_Cass_Dpon_Sory_Tcas_CDS_dir Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.pe26291666
CO1_6sp Smad_Agra_Cass_Dpon_Sory_Tcas_Dmel_fasta_dir Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.pe26310331
Orthofinder_240917_RH.sh Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.po26291666
Orthofinder_240917_RH.sh.e26802366 Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.po26310331
Orthofinder_240917_RH.sh.o26802366 Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh.e26224546 make_philo_tree
Orthofinder_240917_RH.sh.pe26802366 Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh.o26224546 seven_sp.sh
Orthofinder_240917_RH.sh.po26802366 Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh.pe26224546 seven_sp.sh.e26639936
Orthofinder_241019.sh Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest.sh.po26224546 seven_sp.sh.o26639936
Orthofinder_241019.sh.e27076911 Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh seven_sp.sh.pe26639936
Orthofinder_241019.sh.o27076911 Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.e26291666 seven_sp.sh.po26639936
Orthofinder_241019.sh.pe27076911 Smad_Agra_Cass_Dpon_Sory_Tcas_orthotest_CDS.sh.e26310331
kosukesano@at139:~/tools/for_orthofinder$ ls 241104_10sp/OrthoFinder/Results_Nov04/
Citation.txt Gene_Trees Orthogroups Phylogenetically_Misplaced_Genes Single_Copy_Orthologue_Sequences
Comparative_Genomics_Statistics Log.txt Orthologues Putative_Xenologs Species_Tree
Gene_Duplication_Events Orthogroup_Sequences Phylogenetic_Hierarchical_Orthogroups Resolved_Gene_Trees WorkingDirectory
kosukesano@at139:~/tools/for_orthofinder$ できてそう
系統樹のファイルを見てみる
### SpeciesTree_rooted.txtの中身
(Tcas:0.176472,((Agra:0.174713,(Cass:0.172946,((Ojiro:0.111109,Smad:0.12931)0.196244:0.0198427,(Pstr:0.302189,(Dfro:0.104938,(Dpon:0.0337761,Dval:0.0588316)0.422295:0.0411228)0.785665:0.139116)0.229497:0.0481502)0.113887:0.0180141)0.171089:0.0194423)0.502757:0.0371578,Sory:0.190036)1:0.176472);マダラとオジロがクレードを作っちゃってるな……。
合計10種でのIQTREE
~/tools/for_IQTREE/241105_10spを作成、その下でIQTREE_1.pyを実行した。
kosukesano@at139:~/tools/for_IQTREE/241105_10sp$ ls
IQTREE_1.py
kosukesano@at139:~/tools/for_IQTREE/241105_10sp$ python IQTREE_1.py
Orthogroup Agra Cass Dfro Dpon ... Ojiro Pstr Smad Sory Tcas
12598 OG0012598 Agra_P_050292700.1 Cass_AG9761214.1 Dfro_g15387.t1 Dpon_P_019755574.2 ... Ojir_g1996.t1 Pstr_g201064.t1 Smad_g6358.t1 Sory_P_030761209.1 Tcas_P_008195282.1
12600 OG0012600 Agra_P_050292798.1 Cass_AG9770235.1 Dfro_g4791.t1 Dpon_P_019769671.2 ... Ojir_g10006.t1 Pstr_g112651.t1 Smad_g12750.t1 Sory_P_030747529.1 Tcas_P_971970.1
12601 OG0012601 Agra_P_050292813.1 Cass_AG9770251.1 Dfro_g4797.t1 Dpon_P_019769634.2 ... Ojir_g10010.t1 Pstr_g123863.t1 Smad_g5261.t1 Sory_P_030747567.1 Tcas_P_968688.1
12602 OG0012602 Agra_P_050292817.1 Cass_AG9770190.1 Dfro_g4800.t1 Dpon_P_019769690.1 ... Ojir_g10011.t1 Pstr_g123862.t1 Smad_g5262.t1 Sory_P_030747568.1 Tcas_P_968766.1
12603 OG0012603 Agra_P_050292879.1 Cass_AG9762270.1 Dfro_g12395.t1 Dpon_P_019773117.1 ... Ojir_g9721.t1 Pstr_g92262.t1 Smad_g12600.t1 Sory_P_030759522.1 Tcas_P_972888.1
... ... ... ... ... ... ... ... ... ... ... ...
13550 OG0013550 Agra_P_050316219.1 Cass_AG9762145.1 Dfro_g12323.t1 Dpon_P_019755290.1 ... Ojir_g5306.t1 Pstr_g188737.t1 Smad_g695.t1 Sory_P_030763497.1 Tcas_P_974991.1
13553 OG0013553 Agra_P_050316250.1 Cass_AG9766496.1 Dfro_g5342.t1 Dpon_P_019767539.2 ... Ojir_g3020.t1 Pstr_g200977.t1 Smad_g4018.t1 Sory_P_030760091.1 Tcas_P_972970.1
13554 OG0013554 Agra_P_050316281.1 Cass_AG9761564.1 Dfro_g15225.t1 Dpon_P_019772888.1 ... Ojir_g1909.t1 Pstr_g200926.t1 Smad_g6322.t1 Sory_P_030758243.1 Tcas_P_967054.1
13557 OG0013557 Agra_P_050316372.1 Cass_AG9765979.1 Dfro_g12508.t1 Dpon_P_019758814.2 ... Ojir_g9692.t1 Pstr_g37198.t1 Smad_g465.t1 Sory_P_030763414.1 Tcas_P_971352.1
13558 OG0013558 Agra_P_050316407.1 Cass_AG9762302.1 Dfro_g12493.t1 Dpon_P_019758828.1 ... Ojir_g9703.t1 Pstr_g98775.t1 Smad_g9572.t1 Sory_P_030763403.1 Tcas_P_008196032.1
[466 rows x 11 columns]
kosukesano@at139:~/tools/for_IQTREE/241105_10sp$ ls続いてconcatinate.shを作成し実行した。
kosukesano@at139:~/tools/for_IQTREE/241105_10sp$ nano concatinate.sh
kosukesano@at139:~/tools/for_IQTREE/241105_10sp$ sh concatinate.sh
start at
Tue Nov 5 10:26:19 JST 2024
Tue Nov 5 10:26:21 JST 2024
kosukesano@at139:~/tools/for_IQTREE/241105_10sp$ ls
IQTREE_1.py all_seq.fa concatinate.sh
kosukesano@at139:~/tools/for_IQTREE/241105_10sp$続いてIQTREE_2.pyを作り、実行した。実行時のコマンドは以下の通り。
python IQTREE_2.py all_seq.fa ../../for_orthofinder/241104_10sp/OrthoFinder/Results_Nov04/ManualPhylo_data/OG_list.txt 続いてMPT環境に入る。
source ~/tools/pyenv_env/ManualPhilo_profile 続いて~/tools/for_orthofinder/241104_10sp/OrthoFinder/Results_Nov04/ManualPhylo_data/に移動、OG_list.txtがあることを確認。
(MPT) kosukesano@at138:~/tools/for_orthofinder/241104_10sp/OrthoFinder/Results_Nov04/ManualPhylo_data$ ls OG_list.txt
OG_list.txt
(MPT) kosukesano@at138:~/tools/for_orthofinder/241104_10sp/OrthoFinder/Results_Nov04/ManualPhylo_data$このディレクトリでalign.shを作成、実行する。
続いて~/tools/for_IQTREE/241105_10spに移動し、IQTREE_3.pyを作成・実行した。
これにより*.maftted.trimed.edit.faファイルができたので、続いてmakerun.pyを実行した。
これによりrun.nexができたので、これを使ってmanualphylo.shをqsubで投げて完了!
ついでにmakealltree.shも投げてASTRALの前準備をしておく。
PANTHERの構築
牧野先生に紹介いただいたサイトからPANTHER19.0_hmmscoring.tgzをダウンロード、遺伝研に~/tools/for_pantherを作成しコピーした。 これを以下のコマンドで解凍
tar -tzvf PANTHER19.0_hmmscoring.tgzマダラ含む6種の昆虫ゲノムを用いたCAFE、マダラで有意に減少した遺伝子
down_Bc=read.csv("/Users/kosukesano/bio/for_cafe/241007_cafe_original_data/useIQTREE/Base_change.tab", sep="\t")
down_lines = readLines("/Users/kosukesano/bio/for_cafe/241007_cafe_original_data/useIQTREE/Base_asr.tre")
# TREESセクションのみを抽出する
trees_start <- which(grepl("BEGIN TREES;", down_lines))
trees_end <- which(grepl("END;", down_lines))
down_trees_lines <- lines[(trees_start + 1):(trees_end - 1)]
# 不要なスペースを削除
down_trees_lines <- gsub("^\\s+|\\s+$", "", down_trees_lines)
# データフレームに変換
library(tibble)
down_trees_df = tibble(Tree= down_trees_lines)
down_ex=down_trees_df|>###マダラで優位に増減したOGのOG番号を抽出したファイル
#lines|>
tidyr::separate(Tree, into = c("OG_num", "tree"), sep = r"(\s=\s)")|>#系統樹の文字列をOG番号の列とツリーの列に分割
dplyr::mutate(OG_num = stringr::str_extract(OG_num, "OG\\d+")) |>#OG番号の列の余計な文字を除去
dplyr::mutate(Dpon = stringr::str_detect(tree, pattern="Dpon<0>\\*_")) |>#有意な差のある遺伝子ファミリー(*がついてるやつ)を検出
dplyr::mutate(Cass= stringr::str_detect(tree, pattern="Cass<1>\\*_")) |>#有意な差のある遺伝子ファミリー(*がついてるやつ)を検出
dplyr::mutate(Tcas = stringr::str_detect(tree, pattern="<2>\\*_")) |>
dplyr::mutate(Sory = stringr::str_detect(tree, pattern="<3>\\*_")) |>
dplyr::mutate(Smad = stringr::str_detect(tree, pattern="<4>\\*_")) |>
dplyr::mutate(Cass_Dpon = stringr::str_detect(tree, pattern="<5>\\*_")) |>
dplyr::mutate(Sory_Tcas = stringr::str_detect(tree, pattern="<6>\\*_")) |>
dplyr::mutate(Cass_Smad = stringr::str_detect(tree, pattern="<7>\\*_")) |>
dplyr::mutate(Agra = stringr::str_detect(tree, pattern="<8>\\*_")) |>
dplyr::mutate(all = stringr::str_detect(tree, pattern="<9>\\*_")) |>
dplyr::select(!c(tree))
down_Smad_ex = down_ex|>
dplyr::filter(Smad == "TRUE")|>
dplyr::filter(Cass_Smad == "FALSE")|>
dplyr::filter(Cass == "FALSE")|>
dplyr::filter(Cass_Dpon == "FALSE")|>
dplyr::filter(Dpon == "FALSE")|>
dplyr::filter(Agra == "FALSE")|>
dplyr::filter(Sory_Tcas == "FALSE")|>
dplyr::filter(Sory == "FALSE")|>
dplyr::filter(Tcas == "FALSE")|>
dplyr::filter(all == "FALSE")
down_Smad_bc=down_Bc |>###マダラで増加した0Gの0G番号を抽出したファイル
dplyr::select("FamilyID","Smad.4.") |>#OG番号の列とマダラでの遺伝子数の増減量が書いてある列のみを抽出
dplyr::mutate(Smad.4.= stringr::str_extract(Smad.4., r"(^-\d+)")) |>#マダラの遺伝子量増減の列のうち、-がついているものを抽出
tidyr::drop_na()|>
dplyr::filter(Smad.4. != 0)
down_Smad_df=dplyr::inner_join(down_Smad_bc, down_Smad_ex, by = c(FamilyID = "OG_num"))
# Orthogroups.tsvの読み込み
orthogroups <- ### OG番号とそれに対応するマダラ遺伝子IDのファイル
read.delim("/Users/kosukesano/bio/for_cafe/0930_orthofinder_data/Orthogroups.tsv", header=FALSE, sep="\t",
#stringsAsFactors=FALSE,
#col.names = "Data"
skip=1
)|>
dplyr::select("V1", "V5")
down_Smad_df2=dplyr::left_join(down_Smad_df, orthogroups, by = c(FamilyID = "V1"))|>#マダラで有意に増加したOGのOG番号とマダラの遺伝子IDを紐付ける
dplyr::select(!c(Smad.4.)) |>
print() FamilyID Dpon Cass Tcas Sory Smad Cass_Dpon Sory_Tcas Cass_Smad Agra
1 OG0000148 FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
2 OG0000508 FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
3 OG0000510 FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
4 OG0001470 FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
5 OG0002251 FALSE FALSE FALSE FALSE TRUE FALSE FALSE FALSE FALSE
all V5
1 FALSE Smad_g9618.t1
2 FALSE Smad_g861.t1, Smad_g861.t2
3 FALSE Smad_g5018.t1, Smad_g5018.t2
4 FALSE Smad_g8879.t1
5 FALSE Smad_g12072.t1
1107
オジロのBUSCO続き
忘れてた
~/tools/for_braker/Ojiro/gputest/braker/RemakeHedder_OjiroのOjiro_busco.sh.e27077009を見てみる
### Ojiro_busco.sh.e27077009の中身
ERROR: Please do not provide a full path in --out parameter, no slash. Use out_path in the config.ini file to specify the full path.
ERROR: BUSCO analysis failed !
ERROR: Check the logs, read the user guide (https://busco.ezlab.org/busco_userguide.html), and check the BUSCO issue board on https://gitlab.com/ezlab/busco/issuesどうも出力ファイルは絶対・相対パスともに受け付けてもらえず、ファイル名だけをポンと指定しなければいけないらしい。
Ojiro_busco.shを書き換え、qsubで投げた。
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
echo start at
date
singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco\
-m protein\
-i /home/kosukesano/tools/for_braker/Ojiro/gputest/braker/RemakeHedder_Ojiro/Ojiro.fasta\
-o BUSCO_OUTPUT_Ojiro\
-l\
/home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/\
-f
date結果
# BUSCO version is: 5.1.3
# The lineage dataset is: (Creation date: 2024-01-08, number of genomes: 90, number of BUSCOs: 1013)
# Summarized benchmarking in BUSCO notation for file /home/kosukesano/tools/for_braker/Ojiro/gputest/braker/RemakeHedder_Ojiro/Ojiro.fasta
# BUSCO was run in mode: proteins
***** Results: *****
C:98.8%[S:80.4%,D:18.4%],F:0.2%,M:1.0%,n:1013
1000 Complete BUSCOs (C)
814 Complete and single-copy BUSCOs (S)
186 Complete and duplicated BUSCOs (D)
2 Fragmented BUSCOs (F)
11 Missing BUSCOs (M)
1013 Total BUSCO groups searched
Dependencies and versions:
hmmsearch: 3.1めっちゃ高いじゃん
PANTHER続き
この前の重たいファイルの解凍が終わった。
続いて、他のファイルをwgetで得る。
wget http://data.pantherdb.org/ftp/hmm_scoring/current_release/pantherScore2.2/lib/続いてhmmerをダウンロードして解凍。ダウンロード元はこちら
~/tools/for_panther/hmmer-3.1b2に入り、以下のコマンドを実行してビルド
.configure
makeそうするとこうなる
kosukesano@at139:~/tools/for_panther/hmmer-3.1b2$ ls
COPYRIGHT LICENSE Makefile.in Userguide.pdf config.guess config.status configure documentation install-sh profmark src tutorial
INSTALL Makefile README aclocal.m4 config.log config.sub configure.ac easel libdivsufsort release-notes testsuite
kosukesano@at139:~/tools/for_panther/hmmer-3.1b2$ EDTA_profileにhmmerのパスを書いた。
PATH=$PATH:/home/kosukesano/tools/for_panther/hmmer-3.1b2/src実際にPANTHERを動かしてみる
(EDTA2) kosukesano@at138:~/tools/for_panther/pantherScore2.2$ perl pantherScore2.2.pl -l ../target/famlib/rel/PANTHER19.0_altVersion/hmmscoring/PANTHER19.0/ -D B -V -i ../test.fa -o ../output.
txt -n
pantherScore2.2.pl starts at Thu Nov 7 19:49:08 2024
__________________________________________________
Verbose level is high.
Input fasta file is: ../test.fa
Display Type: B
library: ../target/famlib/rel/PANTHER19.0_altVersion/hmmscoring/PANTHER19.0/
Output file is: ../output.txt
__________________________________________________
pantherScore2.2.pl ends at Thu Nov 7 19:50:21 2024結果はこう。
### ~/tools/for_panther/output.txtの中身
SM1_g915.t1 PTHR46564:SF1 TRANSPOSASE 1.1e-17 66.9 88-320
SM1_g915.t1 PTHR46564 TRANSPOSASE 1.1e-17 66.9 88-320
SM1_g915.t2 PTHR46564 TRANSPOSASE 1.1e-17 66.9 109-341
SM1_g915.t2 PTHR46564:SF1 TRANSPOSASE 1.1e-17 66.9 109-341
g9808.t1 PTHR23022:SF119 TC1-LIKE TRANSPOSASE DDE DOMAIN-CONTAINING PROTEIN 9.4e-16 61.4 62-327できてそうじゃね?
1108
PANTHER続き
~/tools/for_panther/pantherScore2.2のpantherScore2.2.plを一部書き換えた。
書き換え前
# necessary libraries
use lib 'lib';
use FamLibBuilder;
use FastaFile;
use strict;
use FileHandle;後
# necessary libraries
#use lib 'lib';
use lib '/home/kosukesano/tools/for_panther/pantherScore2.2/lib';
use FamLibBuilder;
use FastaFile;
use strict;
use FileHandle;こうしたのちに、~/tools/for_panther/working_dir/241108_Madaraディレクトリでpanther_Madara.shを作成、実行した。
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 6
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date
source /home/kosukesano/tools/pyenv_env/EDTA_profile
perl /home/kosukesano/tools/for_panther/pantherScore2.2/pantherScore2.2.pl\
-l /home/kosukesano/tools/for_panther/target/famlib/rel/PANTHER19.0_altVersion/hmmscoring/PANTHER19.0/\
-D B\
-V\
-i /home/kosukesano/tools/for_panther/working_dir/nama_data/Smad.fasta\
-o output.txt\
-n
dateMCOを使ったETEのテスト
~/tools/for_ETE/test_241108を作成、そこにglucose dehydrogenase遺伝子をコードする遺伝子ファミリーであるOG0000769の系統樹をコピーする。系統樹はヘッダー変更後のOrthofinder出力のものを使った
(EDTA2) kosukesano@at138:~/tools/for_ETE/test_241108$ cp ~/tools/for_orthofinder/RemakeHedder_6sp/OrthoFinder/Results_Sep19/Gene_Trees/OG0000769_tree.txt ../test_241108
(EDTA2) kosukesano@at138:~/tools/for_ETE/test_241108$ ls
OG0000769_tree.txt
(EDTA2) kosukesano@at138:~/tools/for_ETE/test_241108$ 続いてconcatinate.shでRenamehedder_6spの.fastaファイルを全て結合したファイルを用意する。
続いて OG0000769の遺伝子IDの.txtファイルを作る。
OG0000769: Agra_P_050303580.1 Agra_P_050303581.1 Cass_AG9768182.1 Dpon_P_019767992.2 Dpon_P_048517880.1 Dpon_P_048518052.1 Smad_g2479.t1 Smad_g3654.t1 Smad_g3654.t2 Smad_g3655.t1 Smad_g3656.t1 Smad_g4097.t1 Smad_g4916.t1 Smad_g4916.t2 Sory_P_030758496.1 Sory_P_030758497.1 Sory_P_030758499.1 Sory_P_030758501.1 Tcas_P_968478.1これはOrthofinder出力のOrthogroups.txtから引っ張ってきた。
これを使ってIQTREE_2.pyを実行。
(EDTA2) kosukesano@at138:~/tools/for_ETE/test_241108$ python IQTREE_2.py all_seq.fa OG0000769.txt
>Agra_P_050303580.1
MSFHACGCETTWVNPSIADTCSGNQYVVFMTLVDMLIRYACKISDPCGRIIPKTQPAAQYDFIVIGAGSGGSTIAGRLAEVNEWNTLLLEAGMDEPPATQIPAVPAFTNTLIDWNFTTQQESGACLSSNGICSWPRGKVLGGSSVFNGMMYMRGTPADYQRWVDAGNTEWSYDDLLPVFKASEGNRQVGSLVDEKYHGTKGPFTIQQFNSHPKLAEDILIAANQSGWPVSNDLNGDQFVGFAIAQTNNRDGARLSLAKAFVRPHKNNDNFDVMINSTVTKILIEGDGDNKRAYGVEFVYNGTTYTVNATKEVILAAGAVQTPQILLLSGIGPKEELDAVNIEQVHNLTGVGKGIKNHVSFSIVGTINETDVVDLNDESLAQYLSKGTGPLSGTGMSQLTARIPSNYTSPDDPDIQLFFSGMSNTCAYSGLPGLPTDPEDPSALRVLSIACVNLHPKSSGQISLLSNNPLDPPKIVANYFNHSDDIKVVLAGVRIAQKLMQSKIMQEKYNFTLQQYDYGNCSSLYEFDTDDFWECAIRYDTYPENHQSASCKIAPQSNEEACVNQRLQVYGISNLRITDASVIYTPTSGNIQAIIVAIAERASQFIREDYGIDSQI
.
.
.
.
.
.
.
>Tcas_P_968478.1
MSCCANEPYIGPPLDRTCFGGSYIVFMHLLNTLITQQCDVSEICQRINPQLQPDSEYDFVVIGGGAGGSVVAGRLSENPNWKILLIEAGGDEPPGSQVPSMMNNYLGDSQMDWRYRTEPQEMACLGRPGRRCDWPRGRVLGGSGVIHGMMYMRGLPSDYNEWEARGNEGWGYKDVEEYFKKSEGNRDIGDGVEGRYHSSDGPMLVQRFPDQPQIAEDVLRAGAELGYPVVGDLNGEQHWGFTIAQANIKNGSRLSSARAFLRPARNRPNLHVMINSTATKILINSNDTAKTISAVEFTYNNQSFTVKVRREAIVSAGAINTPHLLLLSGIGPREELDKVGIEQVHNLPGVGQNLKNHVSFAVNFQLTKIENYNDLNWNTVREYLTERRGPMSSTGVTQVAARISSKYANPDGKNPDLQFFFSGFLAHCSLSGGVKEPEDPTNPTAAKSFTIRPTFLRPRSRGFIGLNSRDPKEPPLMQPNYLTDEEDVKRMVAGIRIAQNLANTTILTTKYGIQMVNTDYGDCSRNYTFDSDEFWACALRYDTGPENHQSCSCKMGPASDPSAVVDPKLQVHGIEGLRIMDASVMPTVLSGNTHATVVMIAEKGSDYIKQKWSDK
(EDTA2) kosukesano@at138:~/tools/for_ETE/test_241108$ ls
IQTREE_2.py OG0000769.txt OG0000769: OG0000769_tree.txt all_seq.fa concatinate.sh
(EDTA2) kosukesano@at138:~/tools/for_ETE/test_241108$ less OG0000769:
(EDTA2) kosukesano@at138:~/tools/for_ETE/test_241108$ mv OG0000769: OG0000769.fasta
(EDTA2) kosukesano@at138:~/tools/for_ETE/test_241108$ ls
IQTREE_2.py OG0000769.fasta OG0000769.txt OG0000769_tree.txt all_seq.fa concatinate.sh
(EDTA2) kosukesano@at138:~/tools/for_ETE/test_241108$ 出力ファイルがOG0000769:とかいう変な名前だったので変更もしておいた。
この後にMPT環境に入り、align.shを実行。この時、OG0000769:のままでもよかった
1109
重信先生のマダラゲノムを使用したpanther
(MPT) kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ ls
Release_240921-SmiMad_GenePrediction_GM1.zip
(MPT) kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ unzip Release_240921-SmiMad_GenePrediction_GM1.zip
Archive: Release_240921-SmiMad_GenePrediction_GM1.zip
warning: stripped absolute path spec from /
mapname: conversion of failed
extracting: README.md
extracting: .Rhistory
extracting: braker.SmiMad_GM1.gff
extracting: braker.SmiMad_GM1.gff.t2g.txt
extracting: braker.SmiMad_GM1.gff.aa.fasta
extracting: braker.SmiMad_GM1.gff.cds.fasta
extracting: braker.SmiMad_GM1.gff.longest.tsv
extracting: braker.SmiMad_GM1.gff.nr.aa.fasta
extracting: braker.SmiMad_GM1.gff.nr.cds.fasta
extracting: braker.SmiMad_GM1.gff.transcript.fasta
(MPT) kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ ls
README.md braker.SmiMad_GM1.gff braker.SmiMad_GM1.gff.cds.fasta braker.SmiMad_GM1.gff.nr.aa.fasta braker.SmiMad_GM1.gff.t2g.txt
Release_240921-SmiMad_GenePrediction_GM1.zip braker.SmiMad_GM1.gff.aa.fasta braker.SmiMad_GM1.gff.longest.tsv braker.SmiMad_GM1.gff.nr.cds.fasta braker.SmiMad_GM1.gff.transcript.fasta
(MPT) kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ ls braker.SmiMad_GM1.gff.cds.fasta
braker.SmiMad_GM1.gff.cds.fasta
(MPT) kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ less braker.SmiMad_GM1.gff.cds.fasta
(MPT) kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ less README.md
(MPT) kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ cp braker.SmiMad_GM1.gff.cds.fasta ~/tools/for_panther/working_dir/nama_data/
(MPT) kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ cd ~/tools/for_panther/working_dir/
(MPT) kosukesano@at138:~/tools/for_panther/working_dir$ ls
241108_Madara 241108_test nama_data
(MPT) kosukesano@at138:~/tools/for_panther/working_dir$ mkdir 241109_Sigenobu_Madara
(MPT) kosukesano@at138:~/tools/for_panther/working_dir$ cd 241109_Sigenobu_Madara/
(MPT) kosukesano@at138:~/tools/for_panther/working_dir/241109_Sigenobu_Madara$ cp ../241108_Madara/panther_Madara.sh panther_Sigenobu.sh
(MPT) kosukesano@at138:~/tools/for_panther/working_dir/241109_Sigenobu_Madara$ nano panther_Sigenobu.sh
(MPT) kosukesano@at138:~/tools/for_panther/working_dir/241109_Sigenobu_Madara$ qsub panther_Sigenobu.sh
Your job 27239810 ("panther_Sigenobu.sh") has been submitted
(MPT) kosukesano@at138:~/tools/for_panther/working_dir/241109_Sigenobu_Madara$ qstat
job-ID prior name user state submit/start at queue jclass slots ja-task-ID
------------------------------------------------------------------------------------------------------------------------------------------------
27235021 0.25410 panther_Ma kosukesano r 11/08/2024 17:10:29 gpu.q@igt011 6
27234958 0.25331 QLOGIN kosukesano r 11/08/2024 16:32:10 login.q@at138 1
27239810 0.00000 panther_Si kosukesano qw 11/09/2024 10:53:28 16
(MPT) kosukesano@at138:~/tools/for_panther/working_dir/241109_Sigenobu_Madara$ 1111
PANTHERの結果
自分のマダラの方のPANTHERが終わってたので見てみる
### ~/tools/for_panther/working_dir/241108_Madara/output.txtの中身の一部
1 Smad_g9309.t1 PTHR23226:SF416 FI01424P 2.8e-43 151.2 132-521
2 Smad_g10140.t29 PTHR23110:SF111 LONGITUDINALS LACKING PROTEIN, ISOFORMS F_I_K_T 1.6e-103 350.5 1-403
3 Smad_g6717.t2 PTHR47958:SF73 RNA HELICASE 7.1e-265 884.7 63-514
4 Smad_g13408.t1 PTHR24394 ZINC FINGER PROTEIN 5.6e-131 441.5 49-482
5 Smad_g1852.t1 PTHR10009:SF7 GH10609P-RELATED 2.3e-156 523.7 6-419
6 Smad_g11009.t1 PTHR45703:SF32 DYNEINS HEAVY CHAIN 0 6045.5 1-3926
7 Smad_g6907.t3 PTHR16154:SF6 SPINOPHILIN, ISOFORM J 0 1148.1 1-1768
8 Smad_g7989.t1 PTHR14710 GEM-ASSOCIATED PROTEIN 6 4e-33 117.8 6-159
9 Smad_g7989.t1 PTHR14710:SF2 GEM-ASSOCIATED PROTEIN 6 4e-33 117.8 6-159
10 Smad_g6622.t2 PTHR46763:SF1 DYNEIN REGULATORY COMPLEX PROTEIN 8 4.5e-65 221.7 8-163
.
.
.
.
.
16548 Smad_g5292.t1 PTHR24403 ZINC FINGER PROTEIN 1.6e-128 433.4 93-1274
16549 Smad_g10689.t1 PTHR12748 ORIGIN RECOGNITION COMPLEX SUBUNIT 3 6.4e-156 524.0 30-697
16550 Smad_g10665.t1 PTHR12081:SF43 TRANSCRIPTION FACTOR E2F1 3.2e-70 240.5 66-468
16551 Smad_g9538.t1 PTHR47027 REVERSE TRANSCRIPTASE DOMAIN-CONTAINING PROTEIN 1.3e-12 50.4 6-88
16552 Smad_g7813.t1 PTHR47611:SF3 HAT C-TERMINAL DIMERISATION DOMAIN-CONTAINING PROTEIN 2.8e-32 115.8 39-234
16553 Smad_g12219.t1 PTHR45877:SF2 E3 UBIQUITIN-PROTEIN LIGASE SINA-RELATED 1.5e-76 261.0 13-507
16554 Smad_g5017.t1 PTHR17605:SF0 RIBOSOME BIOGENESIS PROTEIN BOP1 3.3e-285 951.1 103-833
16555 Smad_g5017.t1 PTHR17605 RIBOSOME BIOGENESIS PROTEIN BOP1 BLOCK OF PROLIFERATION 1 PROTEIN 3.3e-285 951.1 103-833
16556 Smad_g11653.t1 PTHR43157 PHOSPHATIDYLINOSITOL-GLYCAN BIOSYNTHESIS CLASS F PROTEIN-RELATED 1.4e-88 299.9 9-305
16557 Smad_g10978.t1 PTHR48021:SF46 MAJOR FACILITATOR SUPERFAMILY (MFS) PROFILE DOMAIN-CONTAINING PROTEIN 2.4e-158 531.1 17-465
16558 Smad_g8495.t1 PTHR21411:SF0 REGULATORY PROTEIN ZESTE 2.6e-14 56.4 3-144
16559 Smad_g8495.t1 PTHR21411 APONTIC 2.6e-14 56.4 3-144
16560 Smad_g10140.t17 PTHR23110:SF111 LONGITUDINALS LACKING PROTEIN, ISOFORMS F_I_K_T 3.1e-93 316.6 1-332
16561 Smad_g10839.t1 PTHR22999:SF40 PX DOMAIN-CONTAINING PROTEIN KINASE-LIKE PROTEIN 6.4e-205 685.0 1-567アノテーションがしっかりついてる!
でも3〜4日かかったんだよなあ….。
これをローカルに転送。
output=read.csv("/Users/kosukesano/bio/for_panther/nama_data/241111_Madara_output/output.txt", sep="\t")|>
tidyr::separate(PANTHER_ID, into = c("PANTHER_ID", "family_ID"), sep = ":")
output|>
dplyr::count(gene_ID) |>
tail(n = 5) gene_ID n
14784 Smad_g9995.t1 1
14785 Smad_g9996.t1 2
14786 Smad_g9997.t1 1
14787 Smad_g9998.t1 1
14788 Smad_g9999.t1 1
### 14788遺伝子アウトプットファイルには1遺伝子につき複数のアノテーションがついていた。重複を消すためパターンをカウントすると14788遺伝子あった。
これについて、マッキーさんのトランスポゾンデータセットに該当するものを除去する
tp=read.csv("/Users/kosukesano/bio/for_panther/nama_data/transpsons_data/PANTHER11.0_transpsons.txt", sep="\t")
tp_family=read.csv("/Users/kosukesano/bio/for_panther/nama_data/transpsons_data/PANTHER11.0_transpsons_subfamily.txt", sep="\t")
tp_ID=dplyr::full_join(tp, tp_family, by = "PANTHER_ID")|>
tidyr::separate(PANTHER_ID, into = c("PANTHER_ID", "family_ID"), sep = ":")|>
dplyr::mutate(transpozon = stringr::str_replace_all(PANTHER_ID, "^.*.*$", "TRUE"))
tp_madara1 =dplyr::full_join(tp_ID, output, by = "PANTHER_ID")|>
tidyr::replace_na(list(transpozon="FALSE"))|>
dplyr::filter(stringr::str_detect(transpozon, "TRUE")==FALSE)|>
dplyr::count(gene_ID) |>
tail(n = 5)
tp_madara1 gene_ID n
14476 Smad_g9995.t1 1
14477 Smad_g9996.t1 2
14478 Smad_g9997.t1 1
14479 Smad_g9998.t1 1
14480 Smad_g9999.t1 1
重複を消すためパターンをカウントすると14480遺伝子あった。
思ったより少なかったので、アノテーション情報を参照にトランスポゾンを除去してみる。
filter= output|>
dplyr::mutate(transpozon = stringr::str_replace_all(gene_ID, "^.*.*$", "TRUE"))|>
dplyr::filter(stringr::str_detect(gene_function, "^.*TRANSPOSASE.*$")==TRUE |
stringr::str_detect(gene_function, "^.*TRANSPOSABLE.*$")==TRUE|
stringr::str_detect(gene_function, "^.*TRANSCRIPTASE.*$")==TRUE|
stringr::str_detect(gene_function, "^.*TRANSPOSON.*$")==TRUE)|>
dplyr::select(c(gene_ID, PANTHER_ID, transpozon))
tp_madara2=dplyr::full_join(filter, output, by = "gene_ID", relationship = "many-to-many")|>
tidyr::replace_na(list(transpozon="FALSE"))|>
dplyr::filter(stringr::str_detect(transpozon, "TRUE")==FALSE)|>
dplyr::count(gene_ID) |>
tail(n = 5)
tp_madara2 gene_ID n
14301 Smad_g9995.t1 1
14302 Smad_g9996.t1 2
14303 Smad_g9997.t1 1
14304 Smad_g9998.t1 1
14305 Smad_g9999.t1 1
重複を消すためパターンをカウントすると14305遺伝子あった。
じゃあこの2つの重複部分ってどうなんですか?ということで
filter1=dplyr::full_join(tp_ID, output, by = "PANTHER_ID")|>
tidyr::replace_na(list(transpozon="FALSE"))|>
dplyr::filter(stringr::str_detect(transpozon, "TRUE")==TRUE)|>
dplyr::count(gene_ID)
filter2= output|>
dplyr::mutate(transpozon = stringr::str_replace_all(gene_ID, "^.*.*$", "TRUE"))|>
dplyr::filter(stringr::str_detect(gene_function, "^.*TRANSPOSASE.*$")==TRUE |
stringr::str_detect(gene_function, "^.*TRANSPOSABLE.*$")==TRUE|
stringr::str_detect(gene_function, "^.*TRANSCRIPTASE.*$")==TRUE|
stringr::str_detect(gene_function, "^.*TRANSPOSON.*$")==TRUE)|>
dplyr::select(c(gene_ID, transpozon)) |>
dplyr::count(gene_ID)
filter3=tp_madara=dplyr::full_join(filter1, filter2, by = "gene_ID", relationship = "many-to-many")|>
dplyr::select(c(gene_ID)) |>
dplyr::mutate(transpozon = stringr::str_replace_all(gene_ID, "^.*.*$", "TRUE"))
tp_madara3=dplyr::full_join(filter3, output, by = "gene_ID", relationship = "many-to-many")|>
tidyr::replace_na(list(transpozon="FALSE"))|>
dplyr::filter(stringr::str_detect(transpozon, "TRUE")==FALSE)|>
dplyr::count(gene_ID) |>
tail(n = 5)
tp_madara3 gene_ID n
14141 Smad_g9996.t1 2
14142 Smad_g9997.t1 1
14143 Smad_g9998.t1 1
14144 Smad_g9999.t1 1
14145 <NA> 1
重複を消すためパターンをカウントすると14144遺伝子あった。
結構少なめ……。マッキーさんのデータはPANTHER11.0だったけど、自分のはPANTHER19.0なんだよな。これが理由か?
自分のPANTHER19.0のデータベースを見たけど、どれがどれだかわからなかった。マッキーさんに抽出方法を聞かなきゃ。
ちなみにマダラの遺伝子数はこう。
(MPT) kosukesano@at138:~/tools/for_braker/Madara/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file format type num_seqs sum_len min_len avg_len max_len
braker.aa FASTA Protein 16,570 8,790,187 5 530.5 20,186
(MPT) kosukesano@at138:~/tools/for_braker/Madara/braker$1113
MCOを用いたPAMLやり直し
これまでのall_seq.faはアミノ酸配列を参照していた。CDS版のall_seq.faを新たに取得する。
### concatinate.shの中身
#$ -S /bin/bash
#$ -cwd
echo start at
date
# Enter the directory containing the fasta files
filesout="/home/kosukesano/tools/for_paml/data/241009_RemakeHedder_6sp_afterchange" ## Please replace with the actual directory containing the fasta files
# Define the output directory and output file
new="/home/kosukesano/tools/for_ETE/test_241108"
mkdir -p $new
# Concatenate all fasta files into one file
for file in "$filesout"/*.fasta; do
cat "$file" >> "${new}/CDS_all_seq.fa"
done
echo end at
dateこれで取ったCDS_all_seq.faをもとにOG0000769の配列を取り出し、アライメントする。
続いて~/tools/for_ETE/test_241108/bsA以下でbsA.ctlを実行する
### bsA.ctlの中身
seqfile = /home/kosukesano/tools/for_ETE/test_241108/OG0000769.maffted.trimed.fa
treefile = /home/kosukesano/tools/for_ETE/test_241108/OG0000769.nwk
outfile = /home/kosukesano/tools/for_ETE/test_241108/bsA/result/OG0000769_branch_alt
noisy = 9 * 0,1,2,3,9: how much rubbish on the screen
verbose = 1 * 1: detailed output, 0: concise output
runmode = 0 * 0: user tree; 1: semi-automatic; 2: automatic
* 3: StepwiseAddition; (4,5):PerturbationNNI
seqtype = 1 * 1:codons; 2:AAs; 3:codons-->AAs
CodonFreq = 2 * 0:1/61 each, 1:F1X4, 2:F3X4, 3:codon table
clock = 0 * 0: no clock, unrooted tree, 1: clock, rooted tree
model = 2
* models for codons:
* 0:one, 1:b, 2:2 or more dN/dS ratios for branches
NSsites = 2 * dN/dS among sites. 0:no variation, 1:neutral, 2:positive
icode = 0 * 0:standard genetic code; 1:mammalian mt; 2-10:see below
fix_kappa = 0 * 1: kappa fixed, 0: kappa to be estimated
kappa = 2 * initial or fixed kappa
fix_omega = 0 * 1: omega or omega_1 fixed, 0: estimate
omega = 1 * initial or fixed omega, for codons or codon-transltd AAs
fix_alpha = 1 * 0: estimate gamma shape parameter; 1: fix it at alpha
alpha = .0 * initial or fixed alpha, 0:infinity (constant rate)
Malpha = 0 * different alphas for genes
ncatG = 4 * # of categories in the dG or AdG models of rates
getSE = 0 * 0: don't want them, 1: want S.E.s of estimates
RateAncestor = 0 * (1/0): rates (alpha>0) or ancestral states (alpha=0)
method = 0 * 0: simultaneous; 1: one branch at a time
fix_blength = 0 * 0: ignore, -1: random, 1: initial, 2: fixed, 3: proportional(MPT) kosukesano@at138:~/tools/for_ETE/test_241108$ nano lrp.py
(MPT) kosukesano@at138:~/tools/for_ETE/test_241108$ python lrp.py
0.19340151084909837
(MPT) kosukesano@at138:~/tools/for_ETE/test_241108$有意な差はなかった。
ローカルでのPAML構築
公式Githubサイトを参考に行った。
~/tools/for_pamlで以下を実行
git clone https://github.com/abacus-gene/paml.git
cd paml
cd src
make -f Makefile
rm *.o
mkdir ../bin
mv baseml basemlg chi2 codeml evolver infinitesites mcmctree pamp yn00 ../bin~/tools/pyenv_env/ETE_profileに以下を追記
export PATH=/home/kosukesano/tools/for_paml/paml/bin:$PATH結局エラーは別理由っぽかったけど、備忘録として残しておく。
10種でのCAFE
:~/bio/for_cafe$ mkdir 241113_10sp_Orthofinder_data
:~/bio/for_cafe$ cd 241113_10sp_Orthofinder_data/
:~/bio/for_cafe/241113_10sp_Orthofinder_data$
:~/bio/for_cafe/241113_10sp_Orthofinder_data$ scp kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_orthofinder/241104_10sp/OrthoFinder/Results_Nov04/Orthogroups/Orthogroups.GeneCount.tsv
/Users/kosukesano/bio/for_cafe/241113_10sp_Orthofinder_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
Orthogroups.GeneCount.tsv 100% 1254KB 10.5MB/s 00:00
:~/bio/for_cafe/241113_10sp_Orthofinder_data$ここで以下の通りにRを実行
### 1113
Orthologs_raw <- read_tsv(paste("/Users/kosukesano/bio/for_cafe/241113_10sp_Orthofinder_data/Orthogroups.GeneCount.tsv", sep = "/"))
##Enzanはorthogroupのなかで遺伝子数が変なやつを検出するためのmatrix
Enzan <- Orthologs_raw %>%
select(!c(Orthogroup, Total)) %>%
t()
##saidai, saisyouは各Orthogroupの中で、各種が持っているコピー数の最大値及び最小値を記したdf
saidai <- Enzan %>%
apply(2, max) %>%
as.data.frame() %>%
rename(max_real = ".")
saisyou <- Enzan %>%
apply(2, min) %>%
as.data.frame() %>%
rename(min_real = ".")
##Orthologs_1は各Orthogroupsの最大値、最小値もくっつけたdf
Orthologs_1 <- Orthologs_raw %>% select(!c(Total)) %>%
bind_cols(saidai, saisyou)
##最大値と最小値の差
Orthologs_2 <-Orthologs_1 %>%
mutate(sa = max_real - min_real) %>%
filter(max_real != min_real) %>%
filter(sa < 50)
##外れ値と遺伝子ファミリー数が全種で共通の行を省いた。最後に1列目を複製し列名をいじって、CAFEへのインプットデータの出来上がり。
Orthologs_3 <- Orthologs_2 %>%
mutate(Description = Orthogroup, ID = Orthogroup) %>%
relocate(Description, ID) %>%
select(!c(Orthogroup, max_real, min_real, sa))
#Orthologs_3 %>%
# write_tsv(paste("/Users/kosukesano/bio/for_cafe/241113_10sp_Orthofinder_data/Orthogroups.GeneCount2.tsv", sep = "/"))#, quote = FALSE) #,row.names = FALSE)
tree = read.tree("/Users/kosukesano/bio/for_cafe/241113_10sp_Orthofinder_data/run.nex.treefile")
mrca = getMRCA(tree, tip=c('Tcas', 'Sory')) #分岐年代推定に使うノードの指定
tree2 = chronopl(
tree,
100000,
age.min = 152.3, # 推定分岐年代の最小値(MYA)
age.max = 236.2, # 推定分岐年代の最大値(MYA)
node = mrca, # getMRCAで指定したノード
S = 1,
tol = 1e-20,
CV = FALSE,
eval.max = 500,
iter.max = 500
)
is.ultrametric(tree2) # ultrametricかどうか確認[1] TRUE
#write.tree(tree2, file = "/Users/kosukesano/bio/for_cafe/241113_10sp_Orthofinder_data/tree_ultrametric.nwk") # ultrametric系統樹の保存遺伝研に転送
:~/bio/for_cafe/241113_10sp_Orthofinder_data$ scp /Users/kosukesano/bio/for_cafe/241113_10sp_Orthofinder_data/Orthogroups.GeneCount2.tsv tree_ultrametric.nwk kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_cafe/241113_10sp
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
Orthogroups.GeneCount2.tsv 100% 1467KB 17.5MB/s 00:00
tree_ultrametric.nwk 100% 276 28.7KB/s 00:00
:~/bio/for_cafe/241113_10sp_Orthofinder_data$ CAFE5を実行
kosukesano@at138:~/tools/for_cafe/241113_10sp$ singularity exec -e /usr/local/biotools/c/cafe:5.0.0--h5b5514e_2 cafe5 -i Orthogroups.GeneCount2.tsv -t tree_ultrametric.nwk
Command line: /usr/local/bin/cafe5 -i Orthogroups.GeneCount2.tsv -t tree_ultrametric.nwk
Filtering families not present at the root from: 37427 to 9286
No root family size distribution specified, using uniform distribution
Optimizer strategy: Nelder-Mead with similarity cutoff
Iterations: 300
Expansion: 2
Reflection: 1
Starting Search for Initial Parameter Values
Lambda: 0.0015657989564863
Score (-lnL): 195030.33196575
Lambda: 0.0015657989564863
Score (-lnL): 195030.33196575
Lambda: 0.0016440889043106
.
.
.
.
.
.
Lambda: 0.0016435728328772
Score (-lnL): 194986.33229339
Completed 21 iterations
Time: 0H 0M 4S
Best match is: 0.001643591946634
Final -lnL: 194986.33229273
40 values were attempted (0% rejected)
Inferring processes for Base model
Score (-lnL): 194986.33229273
Maximum possible lambda for this topology: 0.0032212076114385
Computing pvalues...
done!
Starting reconstruction processes for Base model
Done!
kosukesano@at138:~/tools/for_cafe/241113_10sp$ 1115
配列長が最も長いIsoformの抽出、そのためのモジュール導入
高川くんが作ってくれたfaspというモジュールで最長のアイソフォームだけ取り出せるようなので、これを導入
kosukesano@at138:~/tools$ mkdir for_isoform_ex
kosukesano@at138:~/tools$ cd for_isoform_ex/
kosukesano@at138:~/tools/for_isoform_ex$ ls
kosukesano@at138:~/tools/for_isoform_ex$ tool下にfor_isoform_exディレクトリを作成。
kosukesano@at138:~/tools/for_isoform_ex$ python3 -m venv fasp
kosukesano@at138:~/tools/for_isoform_ex$ ls
fasp
kosukesano@at138:~/tools/for_isoform_ex$ source ~/tools/for_isoform_ex/fasp/bin/activate
(fasp) kosukesano@at138:~/tools/for_isoform_ex$fasp用のvenv環境を作成、それを立ち上げる。その時の立ち上げコマンドはsource ~/tools/for_isoform_ex/fasp/bin/activate
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ pip3 install git+https://github.com/tamasakian/fasp.git
Collecting git+https://github.com/tamasakian/fasp.git
Cloning https://github.com/tamasakian/fasp.git to /tmp/pip-req-build-kkooafn8
Running command git clone --filter=blob:none --quiet https://github.com/tamasakian/fasp.git /tmp/pip-req-build-kkooafn8
Resolved https://github.com/tamasakian/fasp.git to commit 64f590e29f3b8bbd8432bf851187ffea29d0a235
Installing build dependencies ... done
Getting requirements to build wheel ... done
Preparing metadata (pyproject.toml) ... done
Collecting biopython
Using cached biopython-1.84-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.2 MB)
Collecting numpy
Using cached numpy-2.1.3-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (16.3 MB)
Building wheels for collected packages: fasp
Building wheel for fasp (pyproject.toml) ... done
Created wheel for fasp: filename=fasp-0.0.1-py3-none-any.whl size=7776 sha256=35195170b52f87867e0cd5059a4e12b655fddd90be32b9ec107812ef812f07af
Stored in directory: /tmp/pip-ephem-wheel-cache-11owkspy/wheels/24/bb/a8/5c34a8384cbe3415028754571c2d0015f2486e23528fce65d2
Successfully built fasp
Installing collected packages: numpy, biopython, fasp
Successfully installed biopython-1.84 fasp-0.0.1 numpy-2.1.3
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ type python3
python3 is /lustre7/home/kosukesano/tools/for_isoform_ex/fasp/bin/python3
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ 実行のコマンドはこんな感じ
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ python3 -m fasp exclude_isoforms_by_length nama_data/Tcas.faa output_data/Tcas_iso1.faa nama_data/Tcas.gff
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ less output_data/Tcas_iso1.faa faspのインストール
BRAKER出力ファイルに対する最長Isoformの抽出
/home/kosukesano/tools/for_isoform_exでtest.pyを作成、実行した。
#!/usr/bin/env python3
"""Library for processing protein FASTA files.
Functions
---------
exclude_isoforms_by_length: Exclude isoforms based on length.
exclude_non_nuclear_proteins: Exclude mitochondrial and chloroplast proteins.
"""
from Bio import SeqIO
def exclude_isoforms_by_length(input_filename: str, output_filename: str, gff3_file: str) -> None:
"""Exclude isoforms based on length.
Args
----
input_filename : str
Input protein FASTA filename.
output_filename : str
Output protein FASTA filename.
gff3_file : str
Input genome GFF3 filename.
"""
def parse_gff3(gff3_file: str) -> dict:
"""Parse GFF3 file and make dict with 'protein_id', 'start', 'end' and 'length' of each gene.
Args
----
gff3_file : str
Returns
-------
genes : dict
Dict with 'protein_id', 'start', 'end' and 'length' of each genes.
"""
genes = {}
with open(gff3_file, "r") as gff3_handle:
for line in gff3_handle:
## Exclude comments
if line.startswith("#"):
continue
li = line.strip().split("\t")
if len(li) != 9:
continue
ptg_num, tools, kind, start, end, score, strand, phase, attributes = li
## Exclude lines other than CDS.
if kind != "CDS":
continue
## Handle attributes.
attr_dict = {}
for attr in attributes.split("; "):
key_value = attr.split(" ")
##print(len(key_value))
if len(key_value) != 2:
continue
key, value = key_value
attr_dict[key] = value
## Exclude CDS without protein_id.
if "transcript_id" not in attr_dict:
print("trans")
continue
## Exclude CDS without protein_id.
if "gene_id" not in attr_dict:
print("gene_ID")
continue
## Read information of CDS.
protein_id = attr_dict["transcript_id"]
protein_id = protein_id.strip('"')
print(protein_id)
if "gene_id" in attr_dict:
gene = attr_dict["gene_id"]
start, end, length = int(start), int(end), int(end) - int(start)
if gene not in genes:
genes[gene] = []
genes[gene].append({"protein_id": protein_id, "start": start, "end": end, "length": length})
return genes
def select_longest_protein(genes: dict) -> dict:
"""Select the longest proteins for each gene based on CDS information.
Args
----
genes : dict
Returns
-------
longest_proteins : dict
"""
longest_proteins = {}
for gene, cds_list in genes.items():
protein_lengths = {}
for cds in cds_list:
protein_id = cds["protein_id"]
length = cds["length"]
if protein_id not in protein_lengths:
protein_lengths[protein_id] = length
else:
protein_lengths[protein_id] += length
## Select the longest protein.
longest_proteins[gene] = max(protein_lengths, key=protein_lengths.get)
return longest_proteins
def slice_proteins(input_filename: str, output_filename: str, longest_proteins: dict) -> None:
"""Slice FASTA file to retain only the longest proteins for each gene.
Args
----
input_filename : str
output_filename : str
longest_proteins : dict
"""
input_proteins = SeqIO.to_dict(SeqIO.parse(input_filename, "fasta"))
selected_protein_ids = set(longest_proteins.values())
output_proteins = []
for selected_protein_id in selected_protein_ids:
if selected_protein_id not in input_proteins:
continue
output_proteins.append(input_proteins[selected_protein_id])
with open(output_filename, "w") as output_handle:
SeqIO.write(output_proteins, output_handle, "fasta")
genes = parse_gff3(gff3_file)
longest_proteins = select_longest_protein(genes)
slice_proteins(input_filename, output_filename, longest_proteins)
exclude_isoforms_by_length("nama_data/Madara.aa", "output_data/Madara_iso1.aa", "nama_data/Madara.gtf")これを使う前にちゃんとfasp環境を起動しておくこと。
全部やるとこんな感じ
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ ls nama_data/
Agra.faa Agra.gff Cass.faa Cass.gff Dpon.faa Dpon.gff Madara.aa Madara.gtf Sory.faa Sory.gff Tcas.faa Tcas.gff
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ python3 -m fasp exclude_isoforms_by_length nama_data/Agra.faa output_data/Agra_iso1.faa nama_data/Agra.gff
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ less output_data/Agra_iso1.faa
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ python3 -m fasp exclude_isoforms_by_length nama_data/Cass.faa output_data/Cass_iso1.faa nama_data/Cass.gff
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ less output_data/Cass_iso1.faa
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ python3 -m fasp exclude_isoforms_by_length nama_data/Dpon.faa output_data/Dpon_iso1.faa nama_data/Dpon.gff
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ less output_data/Dpon_iso1.faa
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ python3 -m fasp exclude_isoforms_by_length nama_data/Sory.faa output_data/Sory_iso1.faa nama_data/Sory.gff
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ less output_data/Sory_iso1.faa
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ less output_data/
output_data/ is a directory
(fasp) kosukesano@at138:~/tools/for_isoform_ex$ ls output_data/
Agra_iso1.faa Cass_iso1.faa Dpon_iso1.faa Madara_iso1.aa Sory_iso1.faa Tcas_iso1.faa
(fasp) kosukesano@at138:~/tools/for_isoform_ex$アイソフォームを抜いた状態でのOrthoFinder
~/tools/for_orthofinder/241115_6sp_isoを作成、その下でedit.pyを作成し実行した。
### edit.pyの中身
import os
from Bio import SeqIO
# 入力ディレクトリと出力ディレクトリのパス
input_dir = '/home/kosukesano/tools/for_isoform_ex/output_data/'
output_dir = '../241115_6sp_iso/'
# 出力ディレクトリが存在しない場合は作成
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# 入力ディレクトリ内のすべての .faa または .aa ファイルを処理
for input_file in os.listdir(input_dir):
if input_file.endswith(('.faa', '.aa')):
input_path = os.path.join(input_dir, input_file)
output_path = os.path.join(output_dir, input_file)
# ファイル形式を設定
format_type = 'fasta' # Biopython では .faa も .aa も "fasta" 形式として扱う
with open(output_path, 'w') as outfile:
for record in SeqIO.parse(input_path, format_type):
header = record.description
seq = str(record.seq)
new_header = "" # 初期化
# ヘッダーが「g」で始まる場合
if header.startswith("g"):
number = header.split()[0] # ヘッダーの最初の番号部分を取得
new_header = f">Smad_{number}"
# ヘッダーが「]」で終わる場合
elif header.endswith("]"):
within_brackets = header.split('[')[-1].split(']')[0]
first_letter = within_brackets[0] # 最初の1文字
space_after = within_brackets.split()[-1][:3] # スペース後の3文字
first_part = header.split()[0][1:]
new_header = f">{first_letter}{space_after}_{first_part}"
# それ以外
else:
new_header = f">{header.split()[0]}"
# 新しいヘッダーと配列を出力ファイルに書き込む
outfile.write(f"{new_header}\n{seq}\n")
print(f"{output_path} に保存しました。")前のedit.pyに改良を加えてるよ。
(fasp) kosukesano@at138:~/tools/for_orthofinder/241115_6sp_iso$ python edit.py
../241115_6sp_iso/Cass_iso1.faa に保存しました。
../241115_6sp_iso/Sory_iso1.faa に保存しました。
../241115_6sp_iso/Dpon_iso1.faa に保存しました。
../241115_6sp_iso/Agra_iso1.faa に保存しました。
../241115_6sp_iso/Madara_iso1.aa に保存しました。
../241115_6sp_iso/Tcas_iso1.faa に保存しました。
(fasp) kosukesano@at138:~/tools/for_orthofinder/241115_6sp_iso$ ls
Agra_iso1.faa Cass_iso1.faa Dpon_iso1.faa Madara_iso1.aa Sory_iso1.faa Tcas_iso1.faa edit.pyエラーが怖かったので、一応拡張子を揃えておいた。
(fasp) kosukesano@at138:~/tools/for_orthofinder/241115_6sp_iso$ ls
Agra_iso1.faa Cass_iso1.faa Dpon_iso1.faa Madara_iso1.aa Sory_iso1.faa Tcas_iso1.faa edit.py
(fasp) kosukesano@at138:~/tools/for_orthofinder/241115_6sp_iso$ mv Madara_iso1.aa Smad_iso1.faa
(fasp) kosukesano@at138:~/tools/for_orthofinder/241115_6sp_iso$ ls
Agra_iso1.faa Cass_iso1.faa Dpon_iso1.faa Smad_iso1.faa Sory_iso1.faa Tcas_iso1.faa edit.py
(fasp) kosukesano@at138:~/tools/for_orthofinder/241115_6sp_iso$ ここで、orthofinder_241115.shを作成しqsubで投げた。
#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 16
#$ -l intel
echo start at
date
singularity exec /usr/local/biotools/o/orthofinder:2.5.4--hdfd78af_0 orthofinder\
-f /home/kosukesano/tools/for_orthofinder/241115_6sp_iso\
-t 16
date1118
isoformを抜いた状態でのOrthofinder結果
Orthofinderはちゃんと動作して、結果も出力されてた。
(fasp) kosukesano@at138:~/tools/for_orthofinder/241115_6sp_iso$ ls
Agra_iso1.faa Dpon_iso1.faa Smad_iso1.faa Tcas_iso1.faa orthofinder_241115.sh orthofinder_241115.sh.o27262834 orthofinder_241115.sh.po27262834
Cass_iso1.faa OrthoFinder Sory_iso1.faa edit.py orthofinder_241115.sh.e27262834 orthofinder_241115.sh.pe27262834
(fasp) kosukesano@at138:~/tools/for_orthofinder/241115_6sp_iso$ ls OrthoFinder/Results_Nov15/
Citation.txt Gene_Trees Orthogroups Phylogenetically_Misplaced_Genes Single_Copy_Orthologue_Sequences
Comparative_Genomics_Statistics Log.txt Orthologues Putative_Xenologs Species_Tree
Gene_Duplication_Events Orthogroup_Sequences Phylogenetic_Hierarchical_Orthogroups Resolved_Gene_Trees WorkingDirectorySingle Copy Orthologの数を見てみる
### Orthogroups_SingleCopyOrthologues.txtの末尾
5100 OG0008578
5101 OG0008579
5102 OG0008580
5103 OG0008581
5104 OG0008582
5105 OG0008583
5106 OG0008584
5107 OG0008585
5108 OG00085865108個のSCOが取れた。増えた。
Dfamデータベースを使用したRepeatMasker
Dfamからトランスポゾンのデータを取得する。
その中でもColeopteraのみでフィルタリングしたデータを.fastaで取得、遺伝研のソフトマスク用生データディレクトリに転送した。 ``bash :~/Downloads$ scp /Users/kosukesano/Downloads/dfam-fasta-download.fasta kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_softmask/nama_data/Dfam_coleoptera.fasta Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4 +–[ED25519 256]–+ | ..o.o…* o+ | | . . ..= + o* o| | . = oB +.| | +.oo .+E+o.| | .*S. o.o.+| | .o. . . .+| | .. + . o| | . ..oo . | | . .=. | +—-[SHA256]—–+ dfam-fasta-download.fasta 100% 6397KB 42.5MB/s 00:00
:~/Downloads$
その中身はこんな感じ
```bash
kosukesano@at139:~/tools/for_softmask/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat Dfam_coleoptera.fasta
file format type num_seqs sum_len min_len avg_len max_len
Dfam_coleoptera.fasta FASTA DNA 3,358 6,329,269 98 1,884.8 20,756
kosukesano@at139:~/tools/for_softmask/nama_data$
~/tools/for_softmask/241118_Madara_softmaskディレクトリを作成、以下でconc.pyスクリプトを実行した。
### conc.pyの中身
import os
# 入力ファイルのパス
file1 = '/home/kosukesano/tools/for_softmask/nama_data/Dfam_coleoptera.fasta'
file2 = '/home/kosukesano/tools/for_softmask/nama_data/231117_madaragenome.fasta'
# 出力ディレクトリと出力ファイル名
output_dir = '/home/kosukesano/tools/for_softmask/241118_Madara_softmask'
output_file = os.path.join(output_dir, 'Madara_db.fasta')
# 出力ディレクトリが存在しない場合は作成
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# ファイルを結合して出力
with open(output_file, 'w') as outfile:
for input_file in [file1, file2]:
with open(input_file, 'r') as infile:
outfile.write(infile.read())
print(f"結合されたファイルが {output_file} に保存されました。")Dfamのデータベースとマダラのゲノムを結合するスクリプト。出力は/home/kosukesano/tools/for_softmask/241118_Madara_softmask/Madara_db.fasta。
上記の出力を使ってソフトマスクを行う。EDTA環境を立ち上げた状態で以下を行う。
BLASTデータベースの作成
(EDTA2) kosukesano@at138:~/tools/for_softmask/241118_Madara_softmask$ BuildDatabase -name Madara_Dfam_DB Madara_db.fasta
Building database Madara_Dfam_DB:
Reading Madara_db.fasta...
Number of sequences (bp) added to database: 3567 ( 1301722634 bp )
(EDTA2) kosukesano@at138:~/tools/for_softmask/241118_Madara_softmask$ ls
Madara_Dfam_DB.nhr Madara_Dfam_DB.nin Madara_Dfam_DB.njs Madara_Dfam_DB.nnd Madara_Dfam_DB.nni Madara_Dfam_DB.nog Madara_Dfam_DB.nsq Madara_Dfam_DB.translation Madara_db.fasta conc.py
(EDTA2) kosukesano@at138:~/tools/for_softmask/241118_Madara_softmask$ RepeatModelerの実行
### Madara_ReoeatModeler.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
RepeatModeler -database Madara_Dfam_DB -pa 6
dateこれをqsubで投げた。
1119
重信先生のデータを用いたPANTHER
前に入れたやつが途中で止まってた。
ヤケクソでmedium・24slot指定でブン投げてみる。
1120
Dfamデータベースを使用したRepeatMasker続き
RepeatModelerが終わっていたので、RepeatMaskerに移る。
以下のスクリプトを投げた。
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 12
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date
source /home/kosukesano/tools/pyenv_env/EDTA_profile
RepeatMasker -pa 10\
-xsmall\
-lib /home/kosukesano/tools/for_softmask/241118_Madara_softmask/RM_2232007.MonNov181355052024/consensi.fa.classified\
/home/kosukesano/tools/for_softmask/nama_data/231117_madaragenome.fasta\
-dir /home/kosukesano/tools/for_softmask/241118_Madara_softmask/output_dir
date-dirコマンドで出力のディレクトリを指定している。
ちなみにRepeatModeler出力のconsensi.fa.classifledを見てみる
kosukesano@at138:~/tools/for_softmask/241118_Madara_softmask/RM_2232007.MonNov181355052024$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat consensi.fa.classified
file format type num_seqs sum_len min_len avg_len max_len
consensi.fa.classified FASTA DNA 5,536 3,541,598 30 639.7 16,988
kosukesano@at138:~/tools/for_softmask/241118_Madara_softmask/RM_2232007.MonNov181355052024$ 前のマダラゲノムのみで行なったやつはこれ。
kosukesano@at138:~/tools/for_softmask/RM_16988.WedMay221052072024$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat consensi.fa.classified
file format type num_seqs sum_len min_len avg_len max_len
consensi.fa.classified FASTA DNA 5,527 3,531,164 30 638.9 11,934
kosukesano@at138:~/tools/for_softmask/RM_16988.WedMay221052072024$あんまり変わってなくね?
このデータのマスキングが終わったので、BRAKERにかける
(EDTA2) kosukesano@at138:~/tools/for_softmask$ cp 241118_Madara_softmask/output_dir/231117_madaragenome.fasta.masked ~/tools/for_braker/nama_data/241120_madara_dfam.fasta
(EDTA2) kosukesano@at138:~/tools/for_softmask$ cd ~/tools/for_braker
(EDTA2) kosukesano@at138:~/tools/for_braker$ ls
241013_for_debag_madara Dfro Dval Ekam Femo Femo_pilon Kohuki Kohuki_thread_one Madara Ojiro OnlyProtein_femo OnlyProtein_madara Pstr Sigenobu_Madara nama_data
(EDTA2) kosukesano@at138:~/tools/for_braker$ mkdir 241120_madara_dfam
(EDTA2) kosukesano@at138:~/tools/for_braker$ cd 241120_madara_dfam/
(EDTA2) kosukesano@at138:~/tools/for_braker/241120_madara_dfam$ cp ../241013_for_debag_madara/madara_braker.sh ../241120_madara_dfam/
(EDTA2) kosukesano@at138:~/tools/for_braker/241120_madara_dfam$ ls
madara_braker.sh
(EDTA2) kosukesano@at138:~/tools/for_braker/241120_madara_dfam$ less madara_braker.sh
(EDTA2) kosukesano@at138:~/tools/for_braker/241120_madara_dfam$ nano madara_braker.sh
(EDTA2) kosukesano@at138:~/tools/for_braker/241120_madara_dfam$ qsub madara_braker.sh
Your job 27278006 ("madara_braker.sh") has been submitted
(EDTA2) kosukesano@at138:~/tools/for_braker/241120_madara_dfam$ DfamのRepeatMasker用のデータベースも併せて統合したものを使ってマスキング
kosukesano@at138:~/tools/for_softmask$ mkdir 241120_Madara_softmaskDfamのダウンロードサイトからDfam-RepeatMasker.lib.gzをダウンロードし遺伝研で解凍、それを用いてデータベースを作成し、RepeatModlerをかけた。
Dockerのインストール
Dockerをホームページからインストールし、以下を実行した。
:~/bio/for_RepeatMasker_Docker$ docker login -u k05uke54n0
Password:
Login Succeeded
:~/bio/for_RepeatMasker_Docker$ docker pull dfam/tetools
Using default tag: latest
latest: Pulling from dfam/tetools
7600b3ee981a: Download complete
7605118baa98: Download complete
ffa0e8276bc9: Download complete
d72ae4f33534: Download complete
0616a07cf248: Download complete
4f4fb700ef54: Download complete
742e13a892ac: Download complete
3d2705dcb843: Download complete
dc8ad28c3cd1: Download complete
54ae706075d5: Download complete
44939d338867: Download complete
47bf88a48c47: Download complete
67af5d4f89bd: Download complete
81791adf7c7a: Download complete
8cd46d290033: Download complete
bce2500fb467: Download complete
12a7888856bd: Download complete
429d893a0445: Download complete
e59051f42299: Download complete
cf2ab9e656d9: Download complete
9cae1165f82b: Download complete
ce6de47b44b5: Download complete
513ea75e10b2: Download complete
06658812daff: Download complete
Digest: sha256:f60775010b4dfee18a92aea9191f66cd727d9764c5ba6142e03d3f7719604c28
Status: Downloaded newer image for dfam/tetools:latest
docker.io/dfam/tetools:latest
:~/bio/for_RepeatMasker_Docker$ docker container run -dit --mount type=bind,source="$PWD",target=/work --workdir /work --user "$(id -u):$(id -g)" --name dfamtet dfam/tetools
73f896bc2cd927361517e63d4fdc5242fe8b9287184e9326c4877881bd7aef94
:~/bio/for_RepeatMasker_Docker$ 1125
Dfamデータベース(Beetleのみ)を使用したBRAKER
結果はちゃんと出力されていた。
kosukesano@at139:~/tools/for_braker/241120_madara_dfam/braker$ ls
Augustus GeneMark-ETP braker.aa braker.codingseq braker.gtf braker.log errors genome_header.map hintsfile.gff species what-to-cite.txt
kosukesano@at139:~/tools/for_braker/241120_madara_dfam/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file format type num_seqs sum_len min_len avg_len max_len
braker.aa FASTA Protein 16,972 8,851,473 4 521.5 20,186
kosukesano@at139:~/tools/for_braker/241120_madara_dfam/braker$ ちなみにこれまでのやつはこんな感じ
### マダラゲノム(RNA_seqデータ含)
kosukesano@at137:~/tools/for_braker/Madara/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file format type num_seqs sum_len min_len avg_len max_len
braker.aa FASTA Protein 16,570 8,790,187 5 530.5 20,186
######################################################################ちょっと遺伝子数が増えてるけど、劇的に変わっているわけではない?
Dfamデータベース(RepeatMasker用のデータベースも含む)を使用したRepeatModeler
kosukesano@at139:~/tools/for_softmask/241120_Madara_softmask$ ls RM_1157354.ThuNov211558022024/
consensi.fa consensi.fa.classified families-classified.stk families.stk round-1 round-2 round-3 round-4 round-5 round-6 tmpConsensi.fa
kosukesano@at139:~/tools/for_softmask/241120_Madara_softmask$ cp ../241118_Madara_softmask/Madara_RepeatMasker.sh
cp: missing destination file operand after '../241118_Madara_softmask/Madara_RepeatMasker.sh'
Try 'cp --help' for more information.
kosukesano@at139:~/tools/for_softmask/241120_Madara_softmask$ cp ../241118_Madara_softmask/Madara_RepeatMasker.sh ../241120_Madara_softmask/
kosukesano@at139:~/tools/for_softmask/241120_Madara_softmask$ ls
Madara_Dfam_DB-families.fa Madara_Dfam_DB.nin Madara_Dfam_DB.nni Madara_Dfam_DB.translation Madara_RepeatModeler.sh.e27277193 Madara_RepeatModeler.sh.po27277193 conc.py
Madara_Dfam_DB-families.stk Madara_Dfam_DB.njs Madara_Dfam_DB.nog Madara_RepeatMasker.sh Madara_RepeatModeler.sh.o27277193 Madara_db.fasta
Madara_Dfam_DB.nhr Madara_Dfam_DB.nnd Madara_Dfam_DB.nsq Madara_RepeatModeler.sh Madara_RepeatModeler.sh.pe27277193 RM_1157354.ThuNov211558022024
kosukesano@at139:~/tools/for_softmask/241120_Madara_softmask$ nano Madara_RepeatMasker.sh
kosukesano@at139:~/tools/for_softmask/241120_Madara_softmask$ mkdir output_dir
kosukesano@at139:~/tools/for_softmask/241120_Madara_softmask$ nano Madara_RepeatMasker.sh
kosukesano@at139:~/tools/for_softmask/241120_Madara_softmask$ qsub Madara_RepeatMasker.sh
Your job 27290166 ("Madara_RepeatMasker.sh") has been submittedqsubで投げた。
遺伝研環境でのRepeatMaskerの導入、およびDfamのデータベースを用いたマスキング
apptainerを用いて導入する。
kosukesano@at139:~/tools/for_RepeatMasker_Docker$ apptainer pull dfam-tetools_1.sif docker://dfam/tetools:1
INFO: Converting OCI blobs to SIF format
INFO: Starting build...
Getting image source signatures
Copying blob 742e13a892ac done |
Copying blob 7600b3ee981a done |
Copying blob 0616a07cf248 done |
Copying blob 8cd46d290033 done |
Copying blob 4f4fb700ef54 done |
Copying blob d72ae4f33534 done |
Copying blob dc8ad28c3cd1 done |
Copying blob 54ae706075d5 done |
Copying blob ce6de47b44b5 done |
Copying blob 47bf88a48c47 done |
Copying blob 9cae1165f82b done |
Copying blob bce2500fb467 done |
Copying blob 429d893a0445 done |
Copying blob 7605118baa98 done |
Copying blob ffa0e8276bc9 done |
Copying blob 3d2705dcb843 done |
Copying blob 44939d338867 done |
Copying blob 81791adf7c7a done |
Copying blob e59051f42299 done |
Copying blob cf2ab9e656d9 done |
Copying blob 06658812daff done |
Copying blob 12a7888856bd done |
Copying blob 513ea75e10b2 done |
Copying blob 67af5d4f89bd done |
Copying config da3bae2c6b done |
Writing manifest to image destination
2024/11/25 13:48:40 info unpack layer: sha256:8cd46d290033f265db57fd808ac81c444ec5a5b3f189c3d6d85043b647336913
2024/11/25 13:48:41 info unpack layer: sha256:7600b3ee981a7da30c6181a64e7a862ab42a7ef4eb5f4021770655123d90eaf4
2024/11/25 13:48:48 info unpack layer: sha256:d72ae4f33534d9e04d250010043e874741574cf74aaef61f4bbedcf4b27b6b5d
2024/11/25 13:48:52 info unpack layer: sha256:0616a07cf2481c7ce28b962bc2108052c0784c65602dd014ddb34bff4badf806
2024/11/25 13:48:52 info unpack layer: sha256:4f4fb700ef54461cfa02571ae0db9a0dc1e0cdb5577484a6d75e68dc38e8acc1
2024/11/25 13:48:52 info unpack layer: sha256:742e13a892ac834c62e9c0bd5d91e9798c932c99033ce82244afda4cf4f04314
2024/11/25 13:48:52 info unpack layer: sha256:dc8ad28c3cd175baccbdefca7b8e2c58c8a85a19c3686eb6af8d678dd0d32f8f
2024/11/25 13:48:57 info unpack layer: sha256:54ae706075d59d2da183faf12c8c4997eccadfa0fcfce4eee200fff3e72444b7
2024/11/25 13:48:57 info unpack layer: sha256:ce6de47b44b5618172427bf581e854e6f8a1b36c4e0b2408f1b5c1ce87440137
2024/11/25 13:48:57 info unpack layer: sha256:47bf88a48c4750f30f6cabf07d00bc730b16bab2f0211abac3861563a9a6bcb3
2024/11/25 13:48:57 info unpack layer: sha256:9cae1165f82bff3d4a918672c4ad0f4773cf51ece9de7d0201f2cde10ba2de85
2024/11/25 13:48:57 info unpack layer: sha256:bce2500fb46702c545e7f2a3c3f644085f353c81c866f49c5f8636cfb8bd365e
2024/11/25 13:48:58 info unpack layer: sha256:429d893a0445adeb93e1794c3a455f92e01cf4eecdb981059df9a83cf383d6e7
2024/11/25 13:48:59 info unpack layer: sha256:7605118baa98c83f22c50890d171c55949e2889fca04bcde16de350fe1aa38d3
2024/11/25 13:48:59 info unpack layer: sha256:ffa0e8276bc93f8cea95e70370fd76f73c5f0f2347b2b750556609aefc801d6e
2024/11/25 13:48:59 info unpack layer: sha256:3d2705dcb843a526a186434a876bc5fcdd8ff8d181877229968e364a987c62e1
2024/11/25 13:48:59 info unpack layer: sha256:44939d3388676d5eb2edbc758e3718b52941e38a38d75bde3ff27f2019ede7a7
2024/11/25 13:49:00 info unpack layer: sha256:81791adf7c7ad09fbd8b1eed958889ec4a957d08ee6ca97ed13df548d2671860
2024/11/25 13:49:00 info unpack layer: sha256:e59051f42299289bf4f74dbe3e22b715f1af42cc76baeece0ac7b3e25e6000f4
2024/11/25 13:49:00 info unpack layer: sha256:cf2ab9e656d9dbf726af2e2567e9291da2c96a453fbdf384558eb1b8a53407fa
2024/11/25 13:49:02 info unpack layer: sha256:06658812daff63fbc76ca6723fe76d8185a5fb4ccfed12fc44492bd57c43269f
2024/11/25 13:49:02 info unpack layer: sha256:12a7888856bdb1803b968e5bce63f3ae122cbffef3607eff3ff70498a67928b9
2024/11/25 13:49:02 info unpack layer: sha256:513ea75e10b2549874648c14dc87ae2113d3864707934096030f2e5805a23591
2024/11/25 13:49:02 info unpack layer: sha256:67af5d4f89bde4288ccddd26f2490c30ac03f41352c972bab703c6168fc0f064
INFO: Creating SIF file...
kosukesano@at139:~/tools/for_RepeatMasker_Docker$ ls
dfam-tetools_1.sif
kosukesano@at139:~/tools/for_RepeatMasker_Docker$ これを使ったRepeatMaskerの実行。以下のスクリプトをqsubで投げた。
### madara_softmask.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date
apptainer exec /home/kosukesano/tools/for_RepeatMasker_Docker/dfam-tetools_1.sif\
RepeatMasker\
-pa 6\
-s\
-lib /home/kosukesano/tools/for_RepeatMasker_Docker/nama_data/Dfam-RepeatMasker.lib\
-dir /home/kosukesano/tools/for_RepeatMasker_Docker/241125_madara/output_dir\
-xsmall\
-gff\
/home/kosukesano/tools/for_RepeatMasker_Docker/nama_data/231117_madaragenome.fasta
echo end at
date1126
scorpionでのRepeatMasker環境の構築とマダラソフトマスクの実行
dendezia@scorpion:~/tool/for_RepeatMasker_Docker$ apptainer pull dfam-tetools_1.sif docker://dfam/tetools:1
INFO: Converting OCI blobs to SIF format
INFO: Starting build...
Copying blob 742e13a892ac done |
Copying blob 0616a07cf248 done |
Copying blob 4f4fb700ef54 done |
Copying blob d72ae4f33534 done |
Copying blob 8cd46d290033 done |
Copying blob 7600b3ee981a done |
Copying blob dc8ad28c3cd1 done |
Copying blob 54ae706075d5 done |
Copying blob ce6de47b44b5 done |
Copying blob 47bf88a48c47 done |
Copying blob 9cae1165f82b done |
Copying blob bce2500fb467 done |
Copying blob 429d893a0445 done |
Copying blob 7605118baa98 done |
Copying blob ffa0e8276bc9 done |
Copying blob 3d2705dcb843 done |
Copying blob 44939d338867 done |
Copying blob 81791adf7c7a done |
Copying blob e59051f42299 done |
Copying blob cf2ab9e656d9 done |
Copying blob 06658812daff done |
Copying blob 12a7888856bd done |
Copying blob 513ea75e10b2 done |
Copying blob 67af5d4f89bd done |
Copying config da3bae2c6b done |
Writing manifest to image destination
2024/11/26 10:43:03 info unpack layer: sha256:8cd46d290033f265db57fd808ac81c444ec5a5b3f189c3d6d85043b647336913
2024/11/26 10:43:06 info unpack layer: sha256:7600b3ee981a7da30c6181a64e7a862ab42a7ef4eb5f4021770655123d90eaf4
2024/11/26 10:43:20 info unpack layer: sha256:d72ae4f33534d9e04d250010043e874741574cf74aaef61f4bbedcf4b27b6b5d
2024/11/26 10:43:26 info unpack layer: sha256:0616a07cf2481c7ce28b962bc2108052c0784c65602dd014ddb34bff4badf806
2024/11/26 10:43:26 info unpack layer: sha256:4f4fb700ef54461cfa02571ae0db9a0dc1e0cdb5577484a6d75e68dc38e8acc1
2024/11/26 10:43:26 info unpack layer: sha256:742e13a892ac834c62e9c0bd5d91e9798c932c99033ce82244afda4cf4f04314
2024/11/26 10:43:26 info unpack layer: sha256:dc8ad28c3cd175baccbdefca7b8e2c58c8a85a19c3686eb6af8d678dd0d32f8f
2024/11/26 10:43:33 info unpack layer: sha256:54ae706075d59d2da183faf12c8c4997eccadfa0fcfce4eee200fff3e72444b7
2024/11/26 10:43:34 info unpack layer: sha256:ce6de47b44b5618172427bf581e854e6f8a1b36c4e0b2408f1b5c1ce87440137
2024/11/26 10:43:34 info unpack layer: sha256:47bf88a48c4750f30f6cabf07d00bc730b16bab2f0211abac3861563a9a6bcb3
2024/11/26 10:43:34 info unpack layer: sha256:9cae1165f82bff3d4a918672c4ad0f4773cf51ece9de7d0201f2cde10ba2de85
2024/11/26 10:43:34 info unpack layer: sha256:bce2500fb46702c545e7f2a3c3f644085f353c81c866f49c5f8636cfb8bd365e
2024/11/26 10:43:34 info unpack layer: sha256:429d893a0445adeb93e1794c3a455f92e01cf4eecdb981059df9a83cf383d6e7
2024/11/26 10:43:37 info unpack layer: sha256:7605118baa98c83f22c50890d171c55949e2889fca04bcde16de350fe1aa38d3
2024/11/26 10:43:38 info unpack layer: sha256:ffa0e8276bc93f8cea95e70370fd76f73c5f0f2347b2b750556609aefc801d6e
2024/11/26 10:43:38 info unpack layer: sha256:3d2705dcb843a526a186434a876bc5fcdd8ff8d181877229968e364a987c62e1
2024/11/26 10:43:39 info unpack layer: sha256:44939d3388676d5eb2edbc758e3718b52941e38a38d75bde3ff27f2019ede7a7
2024/11/26 10:43:39 info unpack layer: sha256:81791adf7c7ad09fbd8b1eed958889ec4a957d08ee6ca97ed13df548d2671860
2024/11/26 10:43:39 info unpack layer: sha256:e59051f42299289bf4f74dbe3e22b715f1af42cc76baeece0ac7b3e25e6000f4
2024/11/26 10:43:39 info unpack layer: sha256:cf2ab9e656d9dbf726af2e2567e9291da2c96a453fbdf384558eb1b8a53407fa
2024/11/26 10:43:42 info unpack layer: sha256:06658812daff63fbc76ca6723fe76d8185a5fb4ccfed12fc44492bd57c43269f
2024/11/26 10:43:42 info unpack layer: sha256:12a7888856bdb1803b968e5bce63f3ae122cbffef3607eff3ff70498a67928b9
2024/11/26 10:43:42 info unpack layer: sha256:513ea75e10b2549874648c14dc87ae2113d3864707934096030f2e5805a23591
2024/11/26 10:43:42 info unpack layer: sha256:67af5d4f89bde4288ccddd26f2490c30ac03f41352c972bab703c6168fc0f064
INFO: Creating SIF file...
dendezia@scorpion:~~/tool/for_RepeatMasker_Docker$ ls
dfam-tetools_1.sif
dendezia@scorpion:~/tool/for_RepeatMasker_Docker$ ここで/241126_madara/madara_RepeatMasker.shを作成、実行した。
### madara_RepeatMasker.shの中身
#$ -S /bin/bash
#$ -cwd
echo start at
date
apptainer exec /home/dendezia/tool/for_RepeatMasker_Docker/dfam-tetools_1.sif\
RepeatMasker\
-pa 6\
-s\
-lib /home/dendezia/tool/for_RepeatMasker_Docker/nama_data/Dfam-RepeatMasker.lib\
-dir /home/dendezia/tool/for_RepeatMasker_Docker/241126_madara/output_dir\
-xsmall\
-gff\
/home/dendezia/tool/for_RepeatMasker_Docker/nama_data/231117_madaragenome.fasta
echo end at
dateEDTAのRepeatMaskerでも同じことってできるのかな? #### scorpionでのEDTAを使ったRepeatMaskerの実行
#$ -S /bin/bash
#$ -cwd
echo start at
date
cd /home/dendezia/tool/for_softmask/241126_madara_EDTA/
source /home/dendezia/tool/pyenv_env/EDTA_profile
RepeatMasker\
-pa 6\
-s\
-lib /home/dendezia/tool/for_RepeatMasker_Docker/nama_data/Dfam-RepeatMasker.lib\
-dir /home/dendezia/tool/for_softmask/241126_madara_EDTA/output_dir\
-xsmall\
-gff\
/home/dendezia/tool/for_RepeatMasker_Docker/nama_data/231117_madaragenome.fasta
echo end at
dateこれをqsubで投げた。o
1127
Docker使用のRepeatMasker産物などを使ったBRAKER
現在、
DfamのRepeatMasker用データをEDTAのBuildDataBaseでデータベース化し、EDTA内のRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータDfamのRepeatMasker用データを-libでそのまま指定し、EDTA内のRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータDfamのRepeatMasker用データを-libでそのまま指定し、DockerのRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ
の3パターンがある。
それぞれ~/tools/for_braker/nama_dataに
241127_madara_dfam_RMdata_buildDB.fasta241127_madara_dfam_RM_data_NotUsedBuildDB.fasta241127_madara_DockerRM.fastaとしてマスキングデータを保存。これを使ってBRAKERをかける。
~/tools/for_braker/241127_madaraを作成、その下でそれぞれのデータごとにディレクトリを分けて解析を行う。
kosukesano@at139:~/tools/for_braker/241127_madara$ ls
DockerRM dfam_RM_data_NotUsedBuildDB dfam_RMdata_buildDB
kosukesano@at139:~/tools/for_braker/241127_madara$各ディレクトリでmadara_braker.shを作成、qsubで投げた。
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/241127_madara_DockerRM.fasta\
--prot_seq=/home/kosukesano/tools/Arthropoda.fa\
--rnaseq_sets_ids=adult-1_1,adult-1_2,adult-2_1,adult-2_2,adult-3_1,adult-3_2,\
body-1_1,body-1_2,body-2_1,body-2_2,body-3_1,body-3_2,\
large-larva-1_1,large-larva-1_2,large-larva-2_1,large-larva-2_2,large-larva-3_1,large-larva-3_2,\
middle-larva-1_1,middle-larva-1_2,middle-larva-2_1,middle-larva-2_2,middle-larva-3_1,middle-larva-3_2,\
ovary-1_1,ovary-1_2,ovary-2_1,ovary-2_2,ovary-3_1,ovary-3_2 \
--rnaseq_sets_dir=/home/kosukesano/tools/for_braker/nama_data/Madara_RNAseq\
--threads=16\
--species=Smadaranus_241127_DockerRM\
--AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
--AUGUSTUS_BIN_PATH=/usr/bin\
--AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
--TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
echo end at
dateメモ・RepeatMaskerのバージョンの確認
dockerで入れたRepeatMaskerのバージョンを見てみた。
RepeatMasker version 4.1.7-p1
Unknown option: version
/opt/RepeatMasker/RepeatMasker - 4.1.7-p11128
DfamライブラリとRepeatModeler出力のconsensi.fa.classifiedを結合させたファイルを-libに指定したRepeatMasker
RepeatModeler単体だとライブラリを参照してマスキングをかけるだけ、そこに入力するライブラリの種類が様々ある。RepeatModelerはマスクしたいゲノムからde novoでライブラリを作ってくれるらしい。またDfamには様々な生物のトランスポゾンをまとめたライブラリが存在する。
これ2つのライブラリを結合して、それをインプットにすればいいのでは?
マダラケシツブゾウムシのゲノムを元に作ったライブラリ、~/tools/for_softmask/RM_16988.WedMay221052072024/consensi.fa.classifiedとDfamのDfam-RepeatMasker.libを結合する。結合したファイルは241128_for_madara.libとして出力。
kosukesano@at138:~/tools/for_RepeatMasker_Docker/nama_data$ cat Dfam-RepeatMasker.lib ~/tools/for_softmask/RM_16988.WedMay221052072024/consensi.fa.classified > 241128_for_madara.lib
kosukesano@at138:~/tools/for_RepeatMasker_Docker/nama_data$ ls
231117_madaragenome.fasta 241128_for_madara.lib Dfam-RepeatMasker.lib
kosukesano@at138:~/tools/for_RepeatMasker_Docker/nama_data$これを元にしてRepeatMaskerをかける。~/tools/for_RepeatMasker_Docker/241128_madaraを作成し、その下でmadara_softmask.shを実行。
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date
apptainer exec /home/kosukesano/tools/for_RepeatMasker_Docker/dfam-tetools_1.sif\
RepeatMasker\
-pa 6\
-s\
-lib /home/kosukesano/tools/for_RepeatMasker_Docker/nama_data/241128_for_madara.lib\
-dir /home/kosukesano/tools/for_RepeatMasker_Docker/241128_madara/output_dir\
-xsmall\
-gff\
/home/kosukesano/tools/for_RepeatMasker_Docker/nama_data/231117_madaragenome.fasta
echo end at
date実行に時間がかかりそうだったから、scorpionにも241128_for_madara.libを送って同じことをする。
1129
DfamライブラリとRepeatModeler出力のconsensi.fa.classifiedを結合させたファイルを-libに指定したRepeatMasker結果・それを用いたBRAKER
kosukesano@at137:~/tools/for_RepeatMasker_Docker/241128_madara$ ls
madara_softmask.sh madara_softmask.sh.e27301361 madara_softmask.sh.o27301361 madara_softmask.sh.pe27301361 madara_softmask.sh.po27301361 output_dir
kosukesano@at137:~/tools/for_RepeatMasker_Docker/241128_madara$ ls output_dir/
231117_madaragenome.fasta.cat.gz 231117_madaragenome.fasta.masked 231117_madaragenome.fasta.out 231117_madaragenome.fasta.out.gff 231117_madaragenome.fasta.tbl
kosukesano@at137:~/tools/for_RepeatMasker_Docker/241128_madara$ちゃんと出力されてた。
これを241129_madara_dfamplusbuilddb.fastaとしてコピー。
kosukesano@at137:~/tools/for_RepeatMasker_Docker/241128_madara$ cp output_dir/231117_madaragenome.fasta.masked ~/tools/for_braker/nama_data/241129_madara_dfamplusbuilddb.fasta
kosukesano@at137:~/tools/for_RepeatMasker_Docker/241128_madara$ これを使ってBRAKERを実行。
kosukesano@at137:~/tools/for_RepeatMasker_Docker/241128_madara$ cp output_dir/231117_madaragenome.fasta.masked ~/tools/for_braker/nama_data/241129_madara_dfamplusbuilddb.fasta
kosukesano@at137:~/tools/for_RepeatMasker_Docker/241128_madara$ ~/tools/for_braker/241129_madaraディレクトリを作成、その下でmadara_braker.shを書き実行した。
### madara_braker.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/241129_madara_dfamplusbuilddb.fasta\
--prot_seq=/home/kosukesano/tools/Arthropoda.fa\
--rnaseq_sets_ids=adult-1_1,adult-1_2,adult-2_1,adult-2_2,adult-3_1,adult-3_2,\
body-1_1,body-1_2,body-2_1,body-2_2,body-3_1,body-3_2,\
large-larva-1_1,large-larva-1_2,large-larva-2_1,large-larva-2_2,large-larva-3_1,large-larva-3_2,\
middle-larva-1_1,middle-larva-1_2,middle-larva-2_1,middle-larva-2_2,middle-larva-3_1,middle-larva-3_2,\
ovary-1_1,ovary-1_2,ovary-2_1,ovary-2_2,ovary-3_1,ovary-3_2 \
--rnaseq_sets_dir=/home/kosukesano/tools/for_braker/nama_data/Madara_RNAseq\
--threads=16\
--species=Smadaranus_241129_DockerRM\
--AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
--AUGUSTUS_BIN_PATH=/usr/bin\
--AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
--TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
echo end at
date2024年12月
1202
各マスキングの比較
- マダラのゲノムデータを
EDTAのBuildDataBaseでデータベース化し、EDTA内のRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ(元のやつ)
kosukesano@at137:~/tools/for_braker/Madara/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file format type num_seqs sum_len min_len avg_len max_len
braker.aa FASTA Protein 16,570 8,790,187 5 530.5 20,186
######################################################################- 重信先生アノテーションのデータ
kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.SmiMad_GM1.gff.aa.fasta
file format type num_seqs sum_len min_len avg_len max_len
braker.SmiMad_GM1.gff.aa.fasta FASTA Protein 18,048 9,405,353 2 521.1 20,594
kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$
######################################################################Dfamの甲虫トランスポゾンデータをマダラのゲノムデータと結合し、EDTAのBuildDataBaseでデータベース化し、EDTA内のRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ
kosukesano@at138:~/tools/for_braker/241120_madara_dfam/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file format type num_seqs sum_len min_len avg_len max_len
braker.aa FASTA Protein 16,972 8,851,473 4 521.5 20,186
kosukesano@at138:~/tools/for_braker/241120_madara_dfam/braker$
######################################################################DfamのRepeatMasker用データをEDTAのBuildDataBaseでデータベース化し、EDTA内のRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ
kosukesano@at138:~/tools/for_braker/241127_madara/dfam_RMdata_buildDB/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file format type num_seqs sum_len min_len avg_len max_len
braker.aa FASTA Protein 17,600 9,052,417 5 514.3 20,186
kosukesano@at138:~/tools/for_braker/241127_madara/dfam_RMdata_buildDB/braker$
######################################################################DfamのRepeatMasker用データを-libでそのまま指定し、EDTA内のRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ
kosukesano@at138:~/tools/for_braker/241127_madara/dfam_RM_data_NotUsedBuildDB/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file format type num_seqs sum_len min_len avg_len max_len
braker.aa FASTA Protein 18,546 9,073,620 5 489.2 18,391
kosukesano@at138:~/tools/for_braker/241127_madara/dfam_RM_data_NotUsedBuildDB/braker$
#######################################################################DfamのRepeatMasker用データを-libでそのまま指定し、DockerのRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ
kosukesano@at138:~/tools/for_braker/241127_madara/DockerRM/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file format type num_seqs sum_len min_len avg_len max_len
braker.aa FASTA Protein 18,341 8,915,822 5 486.1 8,823
kosukesano@at138:~/tools/for_braker/241127_madara/DockerRM/braker$
#######################################################################DfamのRepeatMasker用データとマダラのゲノムデータをEDTAのBuildDataBaseでデータベース化したもの結合させ、-libに指定し、DockerのRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ
kosukesano@at138:~/tools/for_braker/241129_madara/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file format type num_seqs sum_len min_len avg_len max_len
braker.aa FASTA Protein 15,150 8,400,426 5 554.5 20,186
kosukesano@at138:~/tools/for_braker/241129_madara/braker$
#######################################################################ちなみにEDTA内のRepeatMaskerのバージョンはこう。
(EDTA2) kosukesano@at138:~/tools/for_braker/Sigenobu_Madara$ RepeatMasker
RepeatMasker version 4.1.2-p1
No query sequence file indicated
/lustre7/home/kosukesano/.pyenv/versions/mambaforge-22.9.0-3/envs/EDTA2/bin/RepeatMasker - 4.1.2-p1
NAME
RepeatMasker - Mask repetitive DNA「DfamのRepeatMasker用データとマダラのゲノムデータをEDTAのBuildDataBaseでデータベース化したもの結合させ、-libに指定し、DockerのRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ」からアイソフォームを抜く
source ~/tools/for_isoform_ex/fasp/bin/activateでfasp環境を立ち上げておく。
その後、~/tools/for_braker/241129_madara以下でExIsoform.pyを作成、実行。
### ExIsoform.pyの中身
#!/usr/bin/env python3
"""Library for processing protein FASTA files.
Functions
---------
exclude_isoforms_by_length: Exclude isoforms based on length.
exclude_non_nuclear_proteins: Exclude mitochondrial and chloroplast proteins.
"""
from Bio import SeqIO
def exclude_isoforms_by_length(input_filename: str, output_filename: str, gff3_file: str) -> None:
"""Exclude isoforms based on length.
Args
----
input_filename : str
Input protein FASTA filename.
output_filename : str
Output protein FASTA filename.
gff3_file : str
Input genome GFF3 filename.
"""
def parse_gff3(gff3_file: str) -> dict:
"""Parse GFF3 file and make dict with 'protein_id', 'start', 'end' and 'length' of each gene.
Args
----
gff3_file : str
Returns
-------
genes : dict
Dict with 'protein_id', 'start', 'end' and 'length' of each genes.
"""
genes = {}
with open(gff3_file, "r") as gff3_handle:
for line in gff3_handle:
## Exclude comments
if line.startswith("#"):
continue
li = line.strip().split("\t")
if len(li) != 9:
continue
ptg_num, tools, kind, start, end, score, strand, phase, attributes = li
## Exclude lines other than CDS.
if kind != "CDS":
continue
## Handle attributes.
attr_dict = {}
for attr in attributes.split("; "):
key_value = attr.split(" ")
##print(len(key_value))
if len(key_value) != 2:
continue
key, value = key_value
attr_dict[key] = value
## Exclude CDS without protein_id.
if "transcript_id" not in attr_dict:
print("trans")
continue
## Exclude CDS without protein_id.
if "gene_id" not in attr_dict:
print("gene_ID")
continue
## Read information of CDS.
protein_id = attr_dict["transcript_id"]
protein_id = protein_id.strip('"')
print(protein_id)
if "gene_id" in attr_dict:
gene = attr_dict["gene_id"]
start, end, length = int(start), int(end), int(end) - int(start)
if gene not in genes:
genes[gene] = []
genes[gene].append({"protein_id": protein_id, "start": start, "end": end, "length": length})
return genes
def select_longest_protein(genes: dict) -> dict:
"""Select the longest proteins for each gene based on CDS information.
Args
----
genes : dict
Returns
-------
longest_proteins : dict
"""
longest_proteins = {}
for gene, cds_list in genes.items():
protein_lengths = {}
for cds in cds_list:
protein_id = cds["protein_id"]
length = cds["length"]
if protein_id not in protein_lengths:
protein_lengths[protein_id] = length
else:
protein_lengths[protein_id] += length
## Select the longest protein.
longest_proteins[gene] = max(protein_lengths, key=protein_lengths.get)
return longest_proteins
def slice_proteins(input_filename: str, output_filename: str, longest_proteins: dict) -> None:
"""Slice FASTA file to retain only the longest proteins for each gene.
Args
----
input_filename : str
output_filename : str
longest_proteins : dict
"""
input_proteins = SeqIO.to_dict(SeqIO.parse(input_filename, "fasta"))
selected_protein_ids = set(longest_proteins.values())
output_proteins = []
for selected_protein_id in selected_protein_ids:
if selected_protein_id not in input_proteins:
continue
output_proteins.append(input_proteins[selected_protein_id])
with open(output_filename, "w") as output_handle:
SeqIO.write(output_proteins, output_handle, "fasta")
genes = parse_gff3(gff3_file)
longest_proteins = select_longest_protein(genes)
slice_proteins(input_filename, output_filename, longest_proteins)
exclude_isoforms_by_length("braker/braker.aa", "241129_madara_iso1.aa", "braker/braker.gtf")これで、アイソフォームを抜いた241129_madara_iso1.aaができた。
(fasp) (EDTA2) kosukesano@at138:~/tools/for_braker/241129_madara$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat 241129_madara_iso1.aa
file format type num_seqs sum_len min_len avg_len max_len
241129_madara_iso1.aa FASTA Protein 12,337 6,131,098 5 497 20,186
(fasp) (EDTA2) kosukesano@at138:~/tools/for_braker/241129_madara$「DfamのRepeatMasker用データとマダラのゲノムデータをEDTAのBuildDataBaseでデータベース化したもの結合させ、-libに指定し、DockerのRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ」のBUSCO
~/tools/for_braker/241129_madaraにて以下のスクリプトを作成し、実行した。
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 2
echo start at
date
date
singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco\
-m protein\
-i /home/kosukesano/tools/for_braker/241129_madara/braker/braker.aa\
-o BUSCO_OUTPUT_MADARA\
-l\
/home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/\
-fちなみに作業ノードで回したらこうなった。
(fasp) (EDTA2) kosukesano@at138:~/tools/for_braker/241129_madara$ singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco -m protein -i /home/kosukesano/tools/for_braker/241129_madara/braker/braker.aa -o BUSCO_OUTPUT_MADARA -l /home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/ -f
INFO: ***** Start a BUSCO v5.1.3 analysis, current time: 12/02/2024 16:31:37 *****
INFO: Configuring BUSCO with local environment
INFO: Mode is proteins
INFO: 'Force' option selected; overwriting previous results directory
INFO: Downloading information on latest versions of BUSCO data...
INFO: Input file is /home/kosukesano/tools/for_braker/241129_madara/braker/braker.aa
INFO: Using local lineages directory /home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/
INFO: Running BUSCO using lineage dataset (eukaryota, 2024-01-08)
INFO: ***** Run HMMER on gene sequences *****
INFO: Running 1013 job(s) on hmmsearch, starting at 12/02/2024 16:31:39
INFO: [hmmsearch] 102 of 1013 task(s) completed
INFO: [hmmsearch] 203 of 1013 task(s) completed
INFO: [hmmsearch] 304 of 1013 task(s) completed
INFO: [hmmsearch] 406 of 1013 task(s) completed
INFO: [hmmsearch] 507 of 1013 task(s) completed
INFO: [hmmsearch] 608 of 1013 task(s) completed
INFO: [hmmsearch] 710 of 1013 task(s) completed
INFO: [hmmsearch] 811 of 1013 task(s) completed
INFO: [hmmsearch] 912 of 1013 task(s) completed
INFO: [hmmsearch] 1013 of 1013 task(s) completed
INFO:
--------------------------------------------------
|Results from dataset |
--------------------------------------------------
|C:96.6%[S:83.4%,D:13.2%],F:0.7%,M:2.7%,n:1013 |
|979 Complete BUSCOs (C) |
|845 Complete and single-copy BUSCOs (S) |
|134 Complete and duplicated BUSCOs (D) |
|7 Fragmented BUSCOs (F) |
|27 Missing BUSCOs (M) |
|1013 Total BUSCO groups searched |
--------------------------------------------------
INFO: BUSCO analysis done. Total running time: 359 seconds
INFO: Results written in /home/kosukesano/tools/for_braker/241129_madara/BUSCO_OUTPUT_MADARA
INFO: For assistance with interpreting the results, please consult the userguide: https://busco.ezlab.org/busco_userguide.html
(fasp) (EDTA2) kosukesano@at138:~/tools/for_braker/241129_madara$1203
GeMoMaの導入
scorpionで実行した。
まず、EDTA環境を立ち上げてmambaを起動する。この状態でEDTA環境から抜けるとpyenvとmambaが使える状態でbase環境に入れる。
dendezia@scorpion:~/tool/pyenv_env$ source EDTA_profile
(EDTA2) dendezia@scorpion:~/tool/pyenv_env$ conda deactivate
(base) dendezia@scorpion:~/tool/pyenv_env$ 次に、gemoma環境を作る。
(base) dendezia@scorpion:~/tool/pyenv_env$ mamba create -n gemoma -y
__ __ __ __
/ \ / \ / \ / \
/ \/ \/ \/ \
███████████████/ /██/ /██/ /██/ /████████████████████████
/ / \ / \ / \ / \ \____
/ / \_/ \_/ \_/ \ o \__,
/ _/ \_____/ `
|/
███╗ ███╗ █████╗ ███╗ ███╗██████╗ █████╗
████╗ ████║██╔══██╗████╗ ████║██╔══██╗██╔══██╗
██╔████╔██║███████║██╔████╔██║██████╔╝███████║
██║╚██╔╝██║██╔══██║██║╚██╔╝██║██╔══██╗██╔══██║
██║ ╚═╝ ██║██║ ██║██║ ╚═╝ ██║██████╔╝██║ ██║
╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝
mamba (1.1.0) supported by @QuantStack
GitHub: https://github.com/mamba-org/mamba
Twitter: https://twitter.com/QuantStack
█████████████████████████████████████████████████████████████
Looking for: []
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
To activate this environment, use
$ mamba activate gemoma
To deactivate an active environment, use
$ mamba deactivate
(base) dendezia@scorpion:~/tool/pyenv_env$ mamba activate gemoma
(gemoma) dendezia@scorpion:~/tool/pyenv_env$空のgemoma環境を立ち上げる。
その中で、GeMoMaをインストールする。
(gemoma) dendezia@scorpion:~/tool/pyenv_env$ mamba install -c conda-forge -c bioconda gemoma=1.9 -y
__ __ __ __
/ \ / \ / \ / \
/ \/ \/ \/ \
███████████████/ /██/ /██/ /██/ /████████████████████████
/ / \ / \ / \ / \ \____
/ / \_/ \_/ \_/ \ o \__,
/ _/ \_____/ `
|/
███╗ ███╗ █████╗ ███╗ ███╗██████╗ █████╗
████╗ ████║██╔══██╗████╗ ████║██╔══██╗██╔══██╗
██╔████╔██║███████║██╔████╔██║██████╔╝███████║
██║╚██╔╝██║██╔══██║██║╚██╔╝██║██╔══██╗██╔══██║
██║ ╚═╝ ██║██║ ██║██║ ╚═╝ ██║██████╔╝██║ ██║
╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝
mamba (1.1.0) supported by @QuantStack
GitHub: https://github.com/mamba-org/mamba
Twitter: https://twitter.com/QuantStack
█████████████████████████████████████████████████████████████
Looking for: ['gemoma=1.9']
bioconda/noarch 5.3MB @ 4.8MB/s 1.2s
bioconda/linux-64 5.7MB @ 4.4MB/s 1.3s
conda-forge/noarch 20.3MB @ 7.0MB/s 3.1s
conda-forge/linux-64 47.2MB @ 7.9MB/s 6.7s
Transaction
Prefix: /home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/envs/gemoma
Updating specs:
- gemoma=1.9
.
.
.
.
.これでGeMoMaのインストールは完了。
ヘルプを出してみる。
(gemoma) dendezia@scorpion:~/tool/pyenv_env$ GeMoMa -h
Searching for the new GeMoMa updates ...
You are using the latest GeMoMa version.
This jar allows to run all parts of GeneModelMapper (GeMoMa) except the external search algorithm (e.g. tblastn).
For more information please visit http://www.jstacs.de/index.php/GeMoMa
If you have any questions, comments or bugs, please check FAQs on our homepage, our github page https://github.com/Jstacs/Jstacs/labels/GeMoMa or contact jens.keilwagen@julius-kuehn.de
If you use this tool, please cite
@article{Keilwagen:2016:GeMoMa,
author = {Keilwagen, Jens and Wenk, Michael and Erickson, Jessica L. and Schattat, Martin H. and Grau, Jan and Hartung, Frank},
title = {{Using intron position conservation for homology-based gene prediction}},
journal = {Nucleic Acids Research},
volume = {44},
number = {9},
pages = {e89-e89},
year = {2016},
month = {02},
issn = {0305-1048},
doi = {10.1093/nar/gkw092}
}
@article{Keilwagen:2018:GeMoMa_RNAseq,
author = {Keilwagen, Jens and Hartung, Frank and Paulini, Michael and Twardziok, Sven O. and Grau, Jan},
title = {Combining RNA-seq data and homology-based gene prediction for plants, animals and fungi},
journal = {BMC Bioinformatics},
year = {2018},
month = {May},
day = {30},
volume = {19},
number = {1},
pages = {189},
issn = {1471-2105},
doi = {10.1186/s12859-018-2203-5}
}
.
.
.
.
.
.いけてそう。
環境立ち上げプロファイルを作成、~/tool/pyenv_envにgemoma_profileを書いた。
### gemoma_profileの中身
source ~/.bash_profile
source ~/pyenv_conda_environment/.pyenv_profile
pyenv global mambaforge-22.9.0-3
# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
eval "$__conda_setup"
else
if [ -f "/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3//etc/profile.d/conda.sh" ]; then
. "/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/conda.sh"
else
export PATH="/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/bin:$PATH"
fi
fi
unset __conda_setup
if [ -f "/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh" ]; then
. "/home/dendezia/.pyenv/versions/mambaforge-22.9.0-3/etc/profile.d/mamba.sh"
fi
# <<< conda initialize <<<
conda activate gemomaこれを遺伝研でもやった。
遺伝研にて、~/tools/for_gemoma/241203_testを作成、以下のスクリプトを実行した。
### madara_gemoma.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 8
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date
source /home/kosukesano/tools/pyenv_env/gemoma_profile
GeMoMa GeMoMaPipeline \
t=/home/kosukesano/tools/for_softmask/nama_data/231117_madaragenome.fasta \
r=NO \
o=true \
i=Tcas \
a=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
g=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
GeMoMa.Score=ReAlign \
AnnotationFinalizer.r=NO \
threads=8 \
outdir=/home/kosukesano/tools/for_gemoma/241203_test/madara_out
echo end at
dateマダラのRNA-seqのBAMファイル
240430_ddbj_backup 240529_RNAseq kosukesano_oldPC old_file 山昆生データ
240514_new_weebil_genome 240705 merged_madara.zip sano
:/Volumes/Elements_1$ pwd
/Volumes/Elements_1
:/Volumes/Elements_1$ pwd
/Volumes/Elements_1
:/Volumes/Elements_1$ cd
:~$ scp /Volumes/Elements_1/merged_madara.zip kosukesano@gw.ddbj.nig.ac.jp:/home/kosukesano/tools/for_gemoma/nama_data
Host key fingerprint is SHA256:Gl1jROYNIyJS1T7yMPvX8J68VfcWjdKVobZkLS1lmG4
+--[ED25519 256]--+
| ..o.o...* o+ |
| . . ..= + o* o|
| . = oB +.|
| +.oo .+E+o.|
| .*S. o.o.+|
| .o. . . .+|
| .. + . o|
| . ..oo . |
| . .=. |
+----[SHA256]-----+
merged_madara.zip (fasp) (EDTA2) kosukesano@at138:~/tools/for_gemoma/nama_data$ ls
merged_madara.zip reference
(fasp) (EDTA2) kosukesano@at138:~/tools/for_gemoma/nama_data$ unzip merged_madara.zip
Archive: merged_madara.zip
inflating: merged_madara.bam
(fasp) (EDTA2) kosukesano@at138:~/tools/for_gemoma/nama_data$
(fasp) (EDTA2) kosukesano@at138:~/tools/for_gemoma/nama_data$ ls
merged_madara.bam merged_madara.zip reference
(fasp) (EDTA2) kosukesano@at138:~/tools/for_gemoma/nama_data$1204
scorpion環境下でのGeMoMa
~/tool/for_gemoma/241204_test下でmadara_gemoma.shを作成、qsubで投げた。
### scorpionのmadara_gemoma.shの中身
#$ -S /bin/bash
#$ -cwd
echo start at
date
source /home/dendezia/tool/pyenv_env/gemoma_profile
GeMoMa GeMoMaPipeline \
t=/home/dendezia/tool/for_gemoma/nama_data/231117_madaragenome.fasta \
r=NO \
o=true \
i=Tcas \
a=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
g=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
GeMoMa.Score=ReAlign \
AnnotationFinalizer.r=NO \
threads=1 \
outdir=/home/dendezia/tool/for_gemoma/241204_test/madara_out
echo end at
datescorpion環境下でのpanther
~/tool/for_pantherを作成、そこで以下のコマンドを実行した。
wget -r http://data.pantherdb.org/ftp/hmm_scoring/current_release/pantherScore2.2/1205
scorpion環境下でのpanther続き
こんな感じになった
dendezia@scorpion:~/tool/for_panther$ ls
data.pantherdb.org
dendezia@scorpion:~/tool/for_panther$ ls data.pantherdb.org/
ftp index.html
dendezia@scorpion:~/tool/for_panther$ ls data.pantherdb.org/ftp/
CellDesigner biopax downloads hmm_classifications index.html ortholog panther_library peregrine_data sequence_classifications tools
TIPS cSNP_analysis generic_mapping hmm_scoring linkouts panther_interpro pathway pub tmp vsftpd
dendezia@scorpion:~/tool/for_panther$ ls data.pantherdb.org/ftp/panther_library/
11.1 12.0 13.0 13.1 14.0 14.1 15.0 16.0 17.0 18.0 19.0 current_release index.html
dendezia@scorpion:~/tool/for_panther$ ls data.pantherdb.org/ftp/panther_library/current_release/
index.html
dendezia@scorpion:~/tool/for_panther$ なんか思ってたのと違うな。
以下のコマンドを実行
dendezia@scorpion:~/tool/for_panther$ wget -r http://data.pantherdb.org/ftp/hmm_scoring/current_release/pantherScore2.2/lib/1209
scorpion環境下でのpanther続き
hummerのファイルを持ってきて、解凍。
:~/Downloads$ scp hmmer-3.1b2.tar.gz dendezia@scorpion:/home/dendezia/tool/for_panther
Host key fingerprint is SHA256:KPa37JYErRVG/1YWy31gMOwAs13hHzUeg3opGD75qVY
+--[ED25519 256]--+
| .+. .=o=+.|
| o*.o.=.*+|
| oo.*oo B.o|
| ..o= +.* ..|
| o .+S o * . |
| . o. . E |
| ....o |
| oo+ |
| o= |
+----[SHA256]-----+
hmmer-3.1b2.tar.gz 100% 5825KB 101.2MB/s 00:00
:~/Downloads$tar -zxvf hmmer-3.1b2.tar.gzこれを実行すると、hmmer-3.1b2/ディレクトリができるので、その中に入る。
makeしよう。
dendezia@scorpion:~/tool/for_panther/hmmer-3.1b2$ ./configure
dendezia@scorpion:~/tool/for_panther/hmmer-3.1b2$ make1212
スロット数とか色々変えたGeMoMa
~/tools/for_gemoma/241212を作成し、そこで色々やった。
kosukesano@at137:~/tools/for_gemoma/241212$ ls
GeMoMa_temp gemoma_slot_1.sh gemoma_slot_8.sh gemoma_slot_8_gpu.sh madara_gemoma.sh
kosukesano@at137:~/tools/for_gemoma/241212$ - スロット数
1、medium指定
### gemoma_slot_1.sh
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 1
#$ -l s_vmem=125G,mem_req=125G
#$ -o ~/results_sh_eando
#$ -e ~/results_sh_eando
ulimit -s unlimited
echo start at
date
source /home/kosukesano/tools/pyenv_env/gemoma_profile
GeMoMa GeMoMaPipeline \
t=/home/kosukesano/tools/for_softmask/nama_data/231117_madaragenome.fasta \
r=NO \
o=true \
i=Tcas \
a=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
g=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
GeMoMa.Score=ReAlign \
AnnotationFinalizer.r=NO \
threads=1 \
outdir=/home/kosukesano/tools/for_gemoma/241212/slot_1_out
echo end at
date- スロット数
8、medium指定
### gemoma_slot_8.sh
#$ -S /bin/bash
#$ -cwd
#$ -l medium
#$ -pe def_slot 8
#$ -l s_vmem=125G,mem_req=125G
#$ -o ~/results_sh_eando
#$ -e ~/results_sh_eando
ulimit -s unlimited
echo start at
date
source /home/kosukesano/tools/pyenv_env/gemoma_profile
GeMoMa GeMoMaPipeline \
t=/home/kosukesano/tools/for_softmask/nama_data/231117_madaragenome.fasta \
r=NO \
o=true \
i=Tcas \
a=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
g=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
GeMoMa.Score=ReAlign \
AnnotationFinalizer.r=NO \
threads=8 \
outdir=/home/kosukesano/tools/for_gemoma/241212/slot_8_out
echo end at
date- スロット数
8、gpu指定
### gemoma_slot_8_gpu.sh
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 8
#$ -l s_vmem=125G,mem_req=125G
#$ -o ~/results_sh_eando
#$ -e ~/results_sh_eando
ulimit -s unlimited
echo start at
date
source /home/kosukesano/tools/pyenv_env/gemoma_profile
GeMoMa GeMoMaPipeline \
t=/home/kosukesano/tools/for_softmask/nama_data/231117_madaragenome.fasta \
r=NO \
o=true \
i=Tcas \
a=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
g=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
GeMoMa.Score=ReAlign \
AnnotationFinalizer.r=NO \
threads=8 \
outdir=/home/kosukesano/tools/for_gemoma/241212/slot_8_gpu_out
echo end at
dateローカルでのPyenv実装
pyenvをgitでインストール
:~$ git clone https://github.com/yyuu/pyenv.git ~/.pyenv
Cloning into '/Users/kosukesano/.pyenv'...
remote: Enumerating objects: 25324, done.
remote: Counting objects: 100% (2077/2077), done.
remote: Compressing objects: 100% (244/244), done.
remote: Total 25324 (delta 1913), reused 1905 (delta 1822), pack-reused 23247 (from 1)
Receiving objects: 100% (25324/25324), 5.62 MiB | 17.70 MiB/s, done.
Resolving deltas: 100% (17063/17063), done.
:~$ ls -a
. .Rapp.history .bash_profile .cups .ncbi .viminfo Applications Library Public
.. .Rhistory .bash_sessions .docker .pyenv .vscode Desktop Movies bin
.CFUserTextEncoding .Trash .bashrc .lesshst .python_history .vscode-R Documents Music bio
.DS_Store .bash_history .config .local .ssh .wget-hsts Downloads Pictures bioinfo
:~$pyenv_conda_environmentディレクトリを作成し、pyenv_profileを作る。
:~$ mkdir pyenv_conda_environment
:~$ cd pyenv_conda_environment/
:~/pyenv_conda_environment$ nano pyenv_profile
:~/pyenv_conda_environment$ ### pyenv_profile
export PYENV_ROOT="$HOME/.pyenv"
export PATH="$PYENV_ROOT/bin:$PATH"
eval "$(pyenv init -)"sourceしてヘルプを見る。実行できてるね。
:~/pyenv_conda_environment$ source pyenv_profile
:~/pyenv_conda_environment$ pyenv
pyenv 2.4.22-1-ga2ad48aa
Usage: pyenv <command> [<args>]
Some useful pyenv commands are:
--version Display the version of pyenv
commands List all available pyenv commands
exec Run an executable with the selected Python version
global Set or show the global Python version(s)
help Display help for a command
hooks List hook scripts for a given pyenv command
init Configure the shell environment for pyenv
install Install a Python version using python-build
latest Print the latest installed or known version with the given prefix
local Set or show the local application-specific Python version(s)
prefix Display prefixes for Python versions
rehash Rehash pyenv shims (run this after installing executables)
root Display the root directory where versions and shims are kept
shell Set or show the shell-specific Python version
shims List existing pyenv shims
uninstall Uninstall Python versions
version Show the current Python version(s) and its origin
version-file Detect the file that sets the current pyenv version
version-name Show the current Python version
version-origin Explain how the current Python version is set
versions List all Python versions available to pyenv
whence List all Python versions that contain the given executable
which Display the full path to an executable
See `pyenv help <command>' for information on a specific command.
For full documentation, see: https://github.com/pyenv/pyenv#readme
:~/pyenv_conda_environment$ローカルでのGeMoMa実装
pyenvのmambaforge環境を用意する。
:~/pyenv_conda_environment$ pyenv install mambaforge-23.10.0-0
Downloading Mambaforge-23.10.0-0-MacOSX-x86_64.sh.sh...
-> https://github.com/conda-forge/miniforge/releases/download/23.10.0-0/Mambaforge-23.10.0-0-MacOSX-x86_64.sh
Installing Mambaforge-23.10.0-0-MacOSX-x86_64.sh...
Channels:
- conda-forge
Platform: osx-64
Collecting package metadata (repodata.json): done
Solving environment: done
==> WARNING: A newer version of conda exists. <==
current version: 23.10.0
latest version: 24.11.0
Please update conda by running
$ conda update -n base -c conda-forge conda
## Package Plan ##
environment location: /Users/kosukesano/.pyenv/versions/mambaforge-23.10.0-0
added / updated specs:
- conda=23.10.0
- pip
The following packages will be downloaded:
package | build
---------------------------|-----------------
ca-certificates-2024.8.30 | h8857fd0_0 155 KB conda-forge
certifi-2024.8.30 | pyhd8ed1ab_0 160 KB conda-forge
openssl-3.4.0 | hd471939_0 2.5 MB conda-forge
pip-24.3.1 | pyh8b19718_0 1.2 MB conda-forge
------------------------------------------------------------
Total: 4.0 MB
The following packages will be UPDATED:
ca-certificates 2023.11.17-h8857fd0_0 --> 2024.8.30-h8857fd0_0
certifi 2023.11.17-pyhd8ed1ab_0 --> 2024.8.30-pyhd8ed1ab_0
openssl 3.2.0-hd75f5a5_0 --> 3.4.0-hd471939_0
pip 23.3.1-pyhd8ed1ab_0 --> 24.3.1-pyh8b19718_0
Downloading and Extracting Packages:
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
Installed Mambaforge-23.10.0-0-MacOSX-x86_64.sh to /Users/kosukesano/.pyenv/versions/mambaforge-23.10.0-0
:~/pyenv_conda_environment$ 空のgemoma環境の作成
:~/pyenv_conda_environment$ mamba create -n gemoma -y
Looking for: []
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
To activate this environment, use
$ mamba activate gemoma
To deactivate an active environment, use
$ mamba deactivate
:~/pyenv_conda_environment$ この後、シェルをリセット。
gemoma_profileを作成
### ~/pyenv_conda_environment/gemoma_profileの中身
source ~/.bash_profile
source ~/pyenv_conda_environment/pyenv_profile
pyenv global mambaforge-23.10.0-0
# >>> conda initialize >>>
# !! Contents within this block are managed by 'conda init' !!
__conda_setup="$('/Users/kosukesano/.pyenv/versions/mambaforge-23.10.0-0/bin/conda' 'shell.bash' 'hook' 2> /dev/null)"
if [ $? -eq 0 ]; then
eval "$__conda_setup"
else
if [ -f "/Users/kosukesano/.pyenv/versions/mambaforge-23.10.0-0/etc/profile.d/conda.sh" ]; then
. "/Users/kosukesano/.pyenv/versions/mambaforge-23.10.0-0/etc/profile.d/conda.sh"
else
export PATH="/Users/kosukesano/.pyenv/versions/mambaforge-23.10.0-0/bin:$PATH"
fi
fi
unset __conda_setup
if [ -f "/Users/kosukesano/.pyenv/versions/mambaforge-23.10.0-0/etc/profile.d/mamba.sh" ]; then
. "/Users/kosukesano/.pyenv/versions/mambaforge-23.10.0-0/etc/profile.d/mamba.sh"
fi
# <<< conda initialize <<<
conda activate gemomaGeMoMaのインストール
(gemoma) :~/pyenv_conda_environment$ mamba install -c conda-forge -c bioconda gemoma=1.9 -y
Looking for: ['gemoma=1.9']
warning libmamba Could not parse mod/etag header
warning libmamba Could not parse mod/etag header
bioconda/osx-64 (check zst) Checked 0.4s
bioconda/noarch (check zst) Checked 0.0s
bioconda/osx-64 3.9MB @ 9.0MB/s 0.5s
bioconda/noarch 4.4MB @ 8.1MB/s 0.6s
conda-forge/noarch 17.7MB @ 18.6MB/s 1.0s
conda-forge/osx-64 35.5MB @ 28.6MB/s 1.3s
Transaction
Prefix: /Users/kosukesano/.pyenv/versions/mambaforge-23.10.0-0/envs/gemoma
Updating specs:
- gemoma=1.9
.
.
.
.
.(gemoma) :~/pyenv_conda_environment$ GeMoMa -h
Searching for the new GeMoMa updates ...
You are using the latest GeMoMa version.
This jar allows to run all parts of GeneModelMapper (GeMoMa) except the external search algorithm (e.g. tblastn).
For more information please visit http://www.jstacs.de/index.php/GeMoMa
If you have any questions, comments or bugs, please check FAQs on our homepage, our github page https://github.com/Jstacs/Jstacs/labels/GeMoMa or contact jens.keilwagen@julius-kuehn.de
If you use this tool, please cite
.
.
.
.
.できた。
ローカルでのGeMoMa実行
#$ -S /bin/bash
#$ -cwd
echo start at
date
source /Users/kosukesano/pyenv_conda_environment/gemoma_profile
GeMoMa GeMoMaPipeline \
t=/Users/kosukesano/bio/231117_madaragenome.fasta \
r=NO \
o=true \
i=Tcas \
a=/Users/kosukesano/bio/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
g=/Users/kosukesano/bio/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
GeMoMa.Score=ReAlign \
AnnotationFinalizer.r=NO \
threads=1 \
outdir=/Users/kosukesano/bio/for_gemoma/241212/madara_out
echo end at
dateこれをshで実行した。
1213
scorpionでのGeMoMa実行
#$ -S /bin/bash
#$ -cwd
echo start at
date
# 環境の読み込み
source /home/dendezia/tool/pyenv_env/gemoma_profile
# GeMoMaPipelineの実行
GeMoMa -Xmx100g GeMoMaPipeline t=/home/dendezia/tool/for_gemoma/nama_data/231117_madaragenome.fasta r=NO o=true i=Tcas a=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff g=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas.fna GeMoMa.Score=ReAlign AnnotationFinalizer.r=NO threads=4
echo end at
dateこれを投げた。改行入れてたら謎のエラーが出て、改行を消したら直った。
同じものをローカルでも動かしておいてる。
デバッグ用にスクリプトもう一個書いた。
### 241213_gemoma_debug.sh
#$ -S /bin/bash
#$ -cwd
echo start at
date
# 環境の読み込み
source /home/dendezia/tool/pyenv_env/gemoma_profile
# GeMoMaPipelineの実行
GeMoMa \
-Xmx100g \
GeMoMaPipeline \
t=/home/dendezia/tool/for_gemoma/nama_data/231117_madaragenome.fasta \
r=NO \
o=true \
i=Tcas \
a=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
g=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
GeMoMa.Score=ReAlign \
AnnotationFinalizer.r=NO \
threads=10 \
outdir=/home/dendezia/tool/for_gemoma/241204_test/madara_out_debug
echo end at
dateこれを以下のコマンドで投げた
(gemoma) dendezia@scorpion:~/tool/for_gemoma/241204_test$ qsub -l ncpus=10 241213_gemoma_debug.sh
2075.scorpion
(gemoma) dendezia@scorpion:~/tool/for_gemoma/241204_test$ 1216
GeMoMa結果
ローカルで実行した方
=============================
Starting: SyntenyChecker (32974.065s)
Finished: SyntenyChecker (32974.407s)
Starting: Extractor for final prediction (32974.407s)
Finished: Extractor for final prediction (33062.396s)
Statistics:
Job WAITING RUNNING INTERRUPTED FAILED SUCCEEDED
---------------------------------------------------------
MmseqsCreateDB 0 0 0 0 1
EREAndFill 0 0 0 0 1
ExtractorAndSplit 0 0 0 0 1
Mmseqs 0 0 0 0 1
GeMoMa 0 0 0 0 1
Cat 0 0 0 0 1
GAF 0 0 0 0 1
AnnotationFinalizer 0 0 0 0 1
Extractor 0 0 0 0 1
SyntenyChecker 0 0 0 0 1
No errors detected.
Elapsed time: 33066 seconds (9h 11m 6s)
end at
2024年 12月13日 金曜日 23時51分28秒 JST
(gemoma) :~/bio/for_gemoma/241212$ ls
GeMoMa_temp madara_geoma.sh madara_out
(gemoma) :~/bio/for_gemoma/241212$ ls madara_
ls: madara_: No such file or directory
(gemoma) :~/bio/for_gemoma/241212$ ls madara_out/
final_annotation.gff protocol_GeMoMaPipeline.txt unfiltered_predictions_from_species_0.gff
predicted_proteins.fasta reference_gene_table.tabularこれ行けたんじゃね?
### final_annotation.gff の中身
##gff-version 3
#SOFTWARE INFO: GeMoMaPipeline 1.9; SIMPLE PARAMETERS: species: own; ID: Tcas; weight: 1.0; tblastn: false; tag: mRNA; RNA-seq evidence: NO; denoise: DENOISE; DenoiseIntrons.maximum intron length: 15000; DenoiseIntrons.minimum expression: 0.01; DenoiseIntrons.context: 10; Extractor.upcase IDs: false; Extractor.repair: false; Extractor.Ambiguity: AMBIGUOUS; Extractor.discard pre-mature stop: true; Extractor.stop-codon excluded from CDS: false; Extractor.full-length: true; GeMoMa.reads: 1; GeMoMa.splice: true; GeMoMa.gap opening: 11; GeMoMa.gap extension: 1; GeMoMa.maximum intron length: 15000; GeMoMa.static intron length: true; GeMoMa.intron-loss-gain-penalty: 25; GeMoMa.reduction factor: 10; GeMoMa.e-value: 100.0; GeMoMa.contig threshold: 0.4; GeMoMa.hit threshold: 0.9; GeMoMa.output: STATIC; GeMoMa.predictions: 10; GeMoMa.avoid stop: true; GeMoMa.approx: true; GeMoMa.protein alignment: true; GeMoMa.verbose: false; GeMoMa.timeout: 3600; GeMoMa.replace unknown: false; GeMoMa.Score: ReAlign; GAF.default attributes: tie,tde,tae,iAA,pAA,score,lpm,maxGap,bestScore,maxScore,raa,rce; GAF.kmeans: NO; GAF.filter: start=='M' and stop=='*' and (isNaN(score) or score/aa>=0.75); GAF.sorting: sumWeight,score,aa; GAF.alternative transcript filter: tie==1 or sumWeight>1; GAF.common border filter: 0.75; GAF.maximal number of transcripts per gene: 2147483647; GAF.add alternative transcripts: false; GAF.transfer features: false; AnnotationFinalizer.transfer features: false; AnnotationFinalizer.UTR: NO; AnnotationFinalizer.rename: NO; AnnotationFinalizer.name attribute: true; synteny check: true; predicted proteins: true; predicted CDSs: false; predicted genomic regions: false; output individual predictions: true; debug: true; restart: false; BLAST_PATH: ; MMSEQS_PATH:
##sequence-region ptg000128l_length_99247 1 99247
ptg000128l_length_99247 GAF gene 17016 17514 . + . ID=gene_10435;transcripts=1;complete=1
ptg000128l_length_99247 GeMoMa mRNA 17016 17514 . + . ID=Tcas_rna-XM_969231.3_R0;ref-gene=Tcas_gene-LOC663173;aa=94;raa=93;score=408;prediction=0;bestScore=408;ce=2;rce=2;pAA=0.8723;iAA=0.8404;lpm=30;maxScore=487;maxGap=1;nps=0;start=M;stop=*;evidence=1;Parent=gene_10435;sumWeight=1.0;
ptg000128l_length_99247 GeMoMa CDS 17016 17126 . + 0 Parent=Tcas_rna-XM_969231.3_R0
ptg000128l_length_99247 GeMoMa CDS 17344 17514 . + 0 Parent=Tcas_rna-XM_969231.3_R0
ptg000128l_length_99247 GAF gene 18035 46865 . + . ID=gene_10436;transcripts=1;complete=1
ptg000128l_length_99247 GeMoMa mRNA 18035 46865 . + . ID=Tcas_rna-XM_968720.3_R0;ref-gene=Tcas_gene-LOC662636;aa=679;raa=677;score=2580;prediction=0;bestScore=2580;ce=11;rce=8;pAA=0.8175;iAA=0.7372;lpm=59;maxScore=3632;maxGap=4;nps=0;start=M;stop=*;evidence=1;Parent=gene_10436;sumWeight=1.0;
ptg000128l_length_99247 GeMoMa CDS 18035 18172 . + 0 Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa CDS 24694 25008 . + 0 Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa CDS 29089 29349 . + 0 Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa CDS 29424 29556 . + 0 Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa CDS 33893 34125 . + 2 Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa CDS 42340 42590 . + 0 Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa CDS 42654 42782 . + 1 Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa CDS 42849 42975 . + 1 Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa CDS 46296 46458 . + 0 Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa CDS 46521 46669 . + 2 Parent=Tcas_rna-XM_968720.3_R0
ptg000128l_length_99247 GeMoMa CDS 46728 46865 . + 0 Parent=Tcas_rna-XM_968720.3_R0
.
.
.
.
.GINGERのインストール
scorpionで行った。
GINGERはアノテーション統合ツールの1つで、EvidenceModelerの代替となりうる(?)ツール
(base) dendezia@scorpion:~/tool$ mamba create -n ginger -y
__ __ __ __
/ \ / \ / \ / \
/ \/ \/ \/ \
███████████████/ /██/ /██/ /██/ /████████████████████████
/ / \ / \ / \ / \ \____
/ / \_/ \_/ \_/ \ o \__,
/ _/ \_____/ `
|/
███╗ ███╗ █████╗ ███╗ ███╗██████╗ █████╗
████╗ ████║██╔══██╗████╗ ████║██╔══██╗██╔══██╗
██╔████╔██║███████║██╔████╔██║██████╔╝███████║
██║╚██╔╝██║██╔══██║██║╚██╔╝██║██╔══██╗██╔══██║
██║ ╚═╝ ██║██║ ██║██║ ╚═╝ ██║██████╔╝██║ ██║
╚═╝ ╚═╝╚═╝ ╚═╝╚═╝ ╚═╝╚═════╝ ╚═╝ ╚═╝
mamba (1.1.0) supported by @QuantStack
GitHub: https://github.com/mamba-org/mamba
Twitter: https://twitter.com/QuantStack
█████████████████████████████████████████████████████████████
Looking for: []
Preparing transaction: done
Verifying transaction: done
Executing transaction: done
To activate this environment, use
$ mamba activate ginger
To deactivate an active environment, use
$ mamba deactivate
(base) dendezia@scorpion:~/tool$ mamba activate ginger
(ginger) dendezia@scorpion:~/tool$ condaで入れようとしたけど、依存ツールが無限に入らず、断念。
gemoma結果のBUSCO
scorpion環境で行った。
### buscoのインストール
(base) dendezia@scorpion:~/tool$ mamba create -n busco -y
(base) dendezia@scorpion:~/tool$ mamba activate busco
(busco) dendezia@scorpion:~/tool$ mamba install -c conda-forge -c bioconda busco=5.8.1 -y### busco.shの中身
#$ -S /bin/bash
#$ -cwd
echo start at
date
source /home/dendezia/tool/pyenv_env/busco_profile
busco\
-m protein\
-i /home/dendezia/tool/for_gemoma/241204_test/madara_out_debug/predicted_proteins.fasta\
-o /home/dendezia/tool/for_gemoma/241204_test/madara_out_debug/BUSCO_OUTPUT_GEMOMA\
-l arthropoda_odb10/\
-f
echo end at
date結果はこんな感じ
### 241204_test/madara_out_debug/busco.sh.o2077の中身
start at
Mon Dec 16 07:05:13 2024
2024-12-16 07:05:47 INFO: ***** Start a BUSCO v5.8.1 analysis, current time: 12/16/2024 07:05:47 *****
2024-12-16 07:05:47 INFO: Configuring BUSCO with local environment
2024-12-16 07:05:47 INFO: Running proteins mode
2024-12-16 07:05:47 INFO: Downloading information on latest versions of BUSCO data...
2024-12-16 07:05:48 INFO: Download connection problem. Retrying in 10 seconds
2024-12-16 07:05:58 INFO: Download connection problem. Retrying in 100 seconds
2024-12-16 07:07:40 INFO: Input file is /home/dendezia/tool/for_gemoma/241204_test/madara_out_debug/predicted_proteins.fasta
2024-12-16 07:07:40 INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/lineages/arthropoda_odb10.2024-01-08.tar.gz'
2024-12-16 07:07:49 INFO: Decompressing file '/misc/home/dendezia/busco_downloads/lineages/arthropoda_odb10.tar.gz'
2024-12-16 07:08:06 INFO: Running BUSCO using lineage dataset arthropoda_odb10 (eukaryota, 2024-01-08)
2024-12-16 07:08:06 INFO: ***** Run HMMER on gene sequences *****
2024-12-16 07:08:06 INFO: Running 1013 job(s) on hmmsearch, starting at 12/16/2024 07:08:06
2024-12-16 07:08:34 INFO: [hmmsearch] 102 of 1013 task(s) completed
2024-12-16 07:08:56 INFO: [hmmsearch] 203 of 1013 task(s) completed
2024-12-16 07:09:14 INFO: [hmmsearch] 304 of 1013 task(s) completed
2024-12-16 07:09:32 INFO: [hmmsearch] 406 of 1013 task(s) completed
2024-12-16 07:09:49 INFO: [hmmsearch] 507 of 1013 task(s) completed
2024-12-16 07:10:23 INFO: [hmmsearch] 608 of 1013 task(s) completed
2024-12-16 07:10:55 INFO: [hmmsearch] 710 of 1013 task(s) completed
2024-12-16 07:11:23 INFO: [hmmsearch] 811 of 1013 task(s) completed
2024-12-16 07:11:47 INFO: [hmmsearch] 912 of 1013 task(s) completed
2024-12-16 07:12:13 INFO: [hmmsearch] 1013 of 1013 task(s) completed
2024-12-16 07:12:20 INFO:
---------------------------------------------------
|Results from dataset arthropoda_odb10 |
---------------------------------------------------
|C:78.3%[S:75.2%,D:3.1%],F:6.4%,M:15.3%,n:1013 |
|793 Complete BUSCOs (C) |
|762 Complete and single-copy BUSCOs (S) |
|31 Complete and duplicated BUSCOs (D) |
|65 Fragmented BUSCOs (F) |
|155 Missing BUSCOs (M) |
|1013 Total BUSCO groups searched |
---------------------------------------------------
2024-12-16 07:12:20 INFO: BUSCO analysis done. Total running time: 279 seconds
2024-12-16 07:12:20 INFO: Results written in home/dendezia/tool/for_gemoma/241204_test/madara_out_debug/BUSCO_OUTPUT_GEMOMA
2024-12-16 07:12:20 INFO: For assistance with interpreting the results, please consult the userguide: https://busco.ezlab.org/busco_userguide.html
2024-12-16 07:12:20 INFO: Visit this page https://gitlab.com/ezlab/busco#how-to-cite-busco to see how to cite BUSCO
end at
Mon Dec 16 07:12:21 2024うーん低い。これはマスキングしていないからか、それともレファレンスが1種類だけだからか?
マスキングしたデータを使ったGeMoMa
241128にRepeatModelerのde novoデータベースとDfamのデータベースを結合したファイルを元にマスキングした結果がscorpionにあったので、これを使ってGeMoMaをやってみよう。
(busco) dendezia@scorpion:~/tool/for_RepeatMasker_Docker$ ls 241128_madara/output_dir/
231117_madaragenome.fasta.cat.gz 231117_madaragenome.fasta.masked 231117_madaragenome.fasta.out 231117_madaragenome.fasta.out.gff 231117_madaragenome.fasta.tbl
(busco) dendezia@scorpion:~/tool/for_RepeatMasker_Docker/241128_madara/output_dir$ cp 231117_madaragenome.fasta.masked ~/tool/for_gemoma/nama_data/241128_madara_masked.fasta
(busco) dendezia@scorpion:~/tool/for_RepeatMasker_Docker/241128_madara/output_dir$ ~/tool/for_gemoma/241216を作成、その下で241216_madara_gemoma.shを書いた。
### 241216_madara_gemoma.shの中身
#$ -S /bin/bash
#$ -cwd
echo start at
date
# 環境の読み込み
source /home/dendezia/tool/pyenv_env/gemoma_profile
# GeMoMaPipelineの実行
GeMoMa \
-Xmx100g \
GeMoMaPipeline \
t=/home/dendezia/tool/for_gemoma/nama_data/241128_madara_masked.fasta \
r=NO \
o=true \
i=Tcas \
a=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
g=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
GeMoMa.Score=ReAlign \
AnnotationFinalizer.r=NO \
threads=10 \
outdir=/home/dendezia/tool/for_gemoma/241216/241216_madara_out
echo end at
dateqsubで投げた
1217
マスキングしたデータを使ったGeMoMa結果
(busco) dendezia@scorpion:~/tool/for_gemoma/241216$ ls 241216_madara_out/
final_annotation.gff predicted_proteins.fasta protocol_GeMoMaPipeline.txt reference_gene_table.tabular unfiltered_predictions_from_species_0.gff
(busco) dendezia@scorpion:~/tool/for_gemoma/241216$できてる。
これをbuscoにかけたらこんな感じ
(busco) dendezia@scorpion:~/tool/for_gemoma/241216$ busco -m protein -i 241216_madara_out/predicted_proteins.fasta -o BUSCO_OUTPUT_GEMOMA -l arthropoda_odb10 -f
2024-12-17 10:19:46 INFO: ***** Start a BUSCO v5.8.1 analysis, current time: 12/17/2024 10:19:46 *****
2024-12-17 10:19:46 INFO: Configuring BUSCO with local environment
2024-12-17 10:19:46 INFO: Running proteins mode
2024-12-17 10:19:46 INFO: Downloading information on latest versions of BUSCO data...
2024-12-17 10:19:50 INFO: Input file is /home/dendezia/tool/for_gemoma/241216/241216_madara_out/predicted_proteins.fasta
2024-12-17 10:19:50 INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/lineages/arthropoda_odb10.2024-01-08.tar.gz'
2024-12-17 10:19:59 INFO: Decompressing file '/home/dendezia/tool/for_gemoma/241216/busco_downloads/lineages/arthropoda_odb10.tar.gz'
2024-12-17 10:20:02 INFO: Running BUSCO using lineage dataset arthropoda_odb10 (eukaryota, 2024-01-08)
2024-12-17 10:20:02 INFO: ***** Run HMMER on gene sequences *****
2024-12-17 10:20:02 INFO: Running 1013 job(s) on hmmsearch, starting at 12/17/2024 10:20:02
2024-12-17 10:20:17 INFO: [hmmsearch] 102 of 1013 task(s) completed
2024-12-17 10:20:35 INFO: [hmmsearch] 203 of 1013 task(s) completed
2024-12-17 10:20:48 INFO: [hmmsearch] 304 of 1013 task(s) completed
2024-12-17 10:20:59 INFO: [hmmsearch] 406 of 1013 task(s) completed
2024-12-17 10:21:10 INFO: [hmmsearch] 507 of 1013 task(s) completed
2024-12-17 10:21:42 INFO: [hmmsearch] 608 of 1013 task(s) completed
2024-12-17 10:22:13 INFO: [hmmsearch] 710 of 1013 task(s) completed
2024-12-17 10:22:37 INFO: [hmmsearch] 811 of 1013 task(s) completed
2024-12-17 10:22:58 INFO: [hmmsearch] 912 of 1013 task(s) completed
2024-12-17 10:23:20 INFO: [hmmsearch] 1013 of 1013 task(s) completed
2024-12-17 10:23:21 INFO:
---------------------------------------------------
|Results from dataset arthropoda_odb10 |
---------------------------------------------------
|C:78.3%[S:75.2%,D:3.1%],F:6.4%,M:15.3%,n:1013 |
|793 Complete BUSCOs (C) |
|762 Complete and single-copy BUSCOs (S) |
|31 Complete and duplicated BUSCOs (D) |
|65 Fragmented BUSCOs (F) |
|155 Missing BUSCOs (M) |
|1013 Total BUSCO groups searched |
---------------------------------------------------
2024-12-17 10:23:21 INFO: BUSCO analysis done. Total running time: 211 seconds
2024-12-17 10:23:21 INFO: Results written in /home/dendezia/tool/for_gemoma/241216/BUSCO_OUTPUT_GEMOMA
2024-12-17 10:23:21 INFO: For assistance with interpreting the results, please consult the userguide: https://busco.ezlab.org/busco_userguide.html
2024-12-17 10:23:21 INFO: Visit this page https://gitlab.com/ezlab/busco#how-to-cite-busco to see how to cite BUSCO
(busco) dendezia@scorpion:~/tool/for_gemoma/241216$seqkitの結果はこう
(busco) dendezia@scorpion:~/tool/for_gemoma/241216$ seqkit stat 241216_madara_out/predicted_proteins.fasta
file format type num_seqs sum_len min_len avg_len max_len
241216_madara_out/predicted_proteins.fasta FASTA Protein 10,463 4,799,632 31 458.7 23,673
(busco) dendezia@scorpion:~/tool/for_gemoma/241216$ ソフトマスクの有無で結果が完全に同じ。GeMoMaはマスキングを認識しない?
RNA-seqデータを追加したGeMoMa
奏子先生に頂いたmerged_madara.zipをscorpionに転送
(gemoma) :~/bio/for_gemoma/241212$ scp /Volumes/Elements_1/merged_madara.zip dendezia@scorpion:/home/dendezia/tool/for_gemoma/nama_data
Host key fingerprint is SHA256:KPa37JYErRVG/1YWy31gMOwAs13hHzUeg3opGD75qVY
+--[ED25519 256]--+
| .+. .=o=+.|
| o*.o.=.*+|
| oo.*oo B.o|
| ..o= +.* ..|
| o .+S o * . |
| . o. . E |
| ....o |
| oo+ |
| o= |
+----[SHA256]-----+
merged_madara.zip 100% 6110MB 107.5MB/s 00:56
(gemoma) :~/bio/for_gemoma/241212$ 解凍
(gemoma) dendezia@scorpion:~/tool/for_gemoma/nama_data$ ls
231117_madaragenome.fasta 241128_madara_masked.fasta merged_madara.zip reference
(gemoma) dendezia@scorpion:~/tool/for_gemoma/nama_data$ unzip merged_madara.zip
Archive: merged_madara.zip
inflating: merged_madara.bam
(gemoma) dendezia@scorpion:~/tool/for_gemoma/nama_data$ ls
231117_madaragenome.fasta 241128_madara_masked.fasta merged_madara.bam merged_madara.zip reference
(gemoma) dendezia@scorpion:~/tool/for_gemoma/nama_data$ ~/tool/for_gemoma/241217を作成し、以下で241217_plusRNA.shを記述、qsubで投げた。
### 241217_plusRNA.sh
#$ -S /bin/bash
#$ -cwd
echo start at
date
# 環境の読み込み
source /home/dendezia/tool/pyenv_env/gemoma_profile
# GeMoMaPipelineの実行
GeMoMa \
-Xmx100g \
GeMoMaPipeline \
t=/home/dendezia/tool/for_gemoma/nama_data/241128_madara_masked.fasta \
o=true \
i=Tcas \
a=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
g=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
GeMoMa.Score=ReAlign \
AnnotationFinalizer.r=NO \
threads=10 \
outdir=/home/dendezia/tool/for_gemoma/241217/241217_plusRNA_out \
r=MAPPED \
ERE.s=FR_FIRST_STRAND \
ERE.m=/home/dendezia/tool/for_gemoma/nama_data/merged_madara.bam \
echo end at
date2種のゲノムをレファレンスにしたGeMoMa
RNA-seqのデータ入れていない。
#$ -S /bin/bash
#$ -cwd
echo start at
date
# 環境の読み込み
source /home/dendezia/tool/pyenv_env/gemoma_profile
# GeMoMaPipelineの実行
GeMoMa \
-Xmx100g \
GeMoMaPipeline \
t=/home/dendezia/tool/for_gemoma/nama_data/241128_madara_masked.fasta \
r=NO \
o=true \
s=own \
i=Tcas \
a=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
g=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
s=own \
i=Hsap \
a=/home/dendezia/tool/for_gemoma/nama_data/reference/Homo_sapiens/ncbi_dataset/data/GCF_000001405.40/genomic.gff \
g=/home/dendezia/tool/for_gemoma/nama_data/reference/Homo_sapiens/ncbi_dataset/data/GCF_000001405.40/GCF_000001405.40_GRCh38.p14_genomic.fna \
GeMoMa.Score=ReAlign \
AnnotationFinalizer.r=NO \
threads=10 \
outdir=/home/dendezia/tool/for_gemoma/241217/241217_2sp_out
echo end at
date1218
RNA-seqも入れたGeMoMa
(gemoma) dendezia@scorpion:~/tool/for_gemoma/241217$ ls 241217_plusRNA_out/
final_annotation.gff predicted_proteins.fasta protocol_GeMoMaPipeline.txt reference_gene_table.tabular unfiltered_predictions_from_species_0.gff
(gemoma) dendezia@scorpion:~/tool/for_gemoma/241217$できてそう。
(busco) dendezia@scorpion:~/tool/for_gemoma/241217/241217_plusRNA_out$ seqkit stat predicted_proteins.fasta
file format type num_seqs sum_len min_len avg_len max_len
predicted_proteins.fasta FASTA Protein 10,463 4,799,632 31 458.7 23,673
(busco) dendezia@scorpion:~/tool/for_gemoma/241217/241217_plusRNA_out$
(busco) dendezia@scorpion:~/tool/for_gemoma/241217/241217_plusRNA_out$ busco -m protein -i predicted_proteins.fasta -o BUSCO_OUTPUT_GEMOMA -l arthropoda_odb10 -f
2024-12-18 11:19:06 INFO: ***** Start a BUSCO v5.8.1 analysis, current time: 12/18/2024 11:19:06 *****
2024-12-18 11:19:06 INFO: Configuring BUSCO with local environment
2024-12-18 11:19:06 INFO: Running proteins mode
2024-12-18 11:19:06 INFO: Downloading information on latest versions of BUSCO data...
2024-12-18 11:19:10 INFO: Input file is /home/dendezia/tool/for_gemoma/241217/241217_plusRNA_out/predicted_proteins.fasta
2024-12-18 11:19:10 INFO: Downloading file 'https://busco-data.ezlab.org/v5/data/lineages/arthropoda_odb10.2024-01-08.tar.gz'
2024-12-18 11:19:19 INFO: Decompressing file '/home/dendezia/tool/for_gemoma/241217/241217_plusRNA_out/busco_downloads/lineages/arthropoda_odb10.tar.gz'
2024-12-18 11:19:22 INFO: Running BUSCO using lineage dataset arthropoda_odb10 (eukaryota, 2024-01-08)
2024-12-18 11:19:22 INFO: ***** Run HMMER on gene sequences *****
2024-12-18 11:19:22 INFO: Running 1013 job(s) on hmmsearch, starting at 12/18/2024 11:19:22
2024-12-18 11:19:38 INFO: [hmmsearch] 102 of 1013 task(s) completed
2024-12-18 11:19:56 INFO: [hmmsearch] 203 of 1013 task(s) completed
2024-12-18 11:20:08 INFO: [hmmsearch] 304 of 1013 task(s) completed
2024-12-18 11:20:21 INFO: [hmmsearch] 406 of 1013 task(s) completed
2024-12-18 11:20:31 INFO: [hmmsearch] 507 of 1013 task(s) completed
2024-12-18 11:21:04 INFO: [hmmsearch] 608 of 1013 task(s) completed
2024-12-18 11:21:35 INFO: [hmmsearch] 710 of 1013 task(s) completed
2024-12-18 11:22:00 INFO: [hmmsearch] 811 of 1013 task(s) completed
2024-12-18 11:22:20 INFO: [hmmsearch] 912 of 1013 task(s) completed
2024-12-18 11:22:42 INFO: [hmmsearch] 1013 of 1013 task(s) completed
2024-12-18 11:22:44 INFO:
---------------------------------------------------
|Results from dataset arthropoda_odb10 |
---------------------------------------------------
|C:78.3%[S:75.2%,D:3.1%],F:6.4%,M:15.3%,n:1013 |
|793 Complete BUSCOs (C) |
|762 Complete and single-copy BUSCOs (S) |
|31 Complete and duplicated BUSCOs (D) |
|65 Fragmented BUSCOs (F) |
|155 Missing BUSCOs (M) |
|1013 Total BUSCO groups searched |
---------------------------------------------------
2024-12-18 11:22:44 INFO: BUSCO analysis done. Total running time: 214 seconds
2024-12-18 11:22:44 INFO: Results written in /home/dendezia/tool/for_gemoma/241217/241217_plusRNA_out/BUSCO_OUTPUT_GEMOMA
2024-12-18 11:22:44 INFO: For assistance with interpreting the results, please consult the userguide: https://busco.ezlab.org/busco_userguide.html
2024-12-18 11:22:44 INFO: Visit this page https://gitlab.com/ezlab/busco#how-to-cite-busco to see how to cite BUSCO予測結果完全に同じなんだが……。なぜえ?
### 241217_plusRNA.sh.e2081の中身
sc - synteny check (run SyntenyChecker if possible, default = true) = true
p - predicted proteins (If *true*, returns the predicted proteins of the target organism as fastA file, default = true) = true
pc - predicted CDSs (If *true*, returns the predicted CDSs of the target organism as fastA file, default = false) = false
pgr - predicted genomic regions (If *true*, returns the genomic regions of predicted gene models of the target organism as fastA file, default = false) = false
o - output individual predictions (If *true*, returns the predictions for each reference species, default = false) = true
debug - debug (If *false* removes all temporary files even if the jobs exits unexpected, default = true) = true
restart - restart (can be used to restart the latest GeMoMaPipeline run, which was finished without results, with very similar parameters, e.g., after an exception was thrown (cf. parameter debug), default = false) = false
b - BLAST_PATH (allows to set a path to the blast binaries if not set in the environment, default = , OPTIONAL) =
m - MMSEQS_PATH (allows to set a path to the blast binaries if not set in the environment, default = , OPTIONAL) =
outdir - The output directory, defaults to the current working directory (.) = /home/dendezia/tool/for_gemoma/241217/241217_plusRNA_out
threads - The number of threads used for the tool, defaults to 1 = 10
[mmseqs]: 16.747c6
java.lang.IllegalStateException: Records A00718:237:HMTKWDSXY:3:1101:1217:1031 (ptg000178c_length_56031:54,907) should come after A00718:237:HMTKWDSXY:3:1101:1217:1031 (ptg000178c_length_56031:54,828) when sorting with htsjdk.samtools.SAMRecordCoordinateComparator
at htsjdk.samtools.SamReader$AssertingIterator.next(SamReader.java:549)
at htsjdk.samtools.SamReader$AssertingIterator.next(SamReader.java:519)
at projects.gemoma.ExtractRNAseqEvidence.run(ExtractRNAseqEvidence.java:526)
at projects.gemoma.GeMoMaPipeline$JEREAndFill.doJob(GeMoMaPipeline.java:1539)
at projects.gemoma.GeMoMaPipeline$FlaggedRunnable.run(GeMoMaPipeline.java:1375)
at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:829)
Check RNA-seq data (introns): 0% of the sequences in the reference genome are covered.
Check RNA-seq data (forward coverage): 0% of the sequences in the reference genome are covered.
Check RNA-seq data (reverse coverage): 0% of the sequences in the reference genome are covered.
Warning: Nashorn engine is planned to be removed from a future JDK releaseどうも.bamの中身がよくないらしい。GeMoMaに沿うように作り直す必要がありそう。
2種の参照配列を入れたGeMoMa
### 241217_2sp.sh.e2082 の中身
threads - The number of threads used for the tool, defaults to 1 = 10
[mmseqs]: 16.747c6
[mmseqs]: Segmentation fault (core dumped)
[mmseqs]: Error: Prefilter died
[mmseqs]: Error: Search step died
java.lang.InterruptedException
at java.base/java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2109)
at java.base/java.util.concurrent.ThreadPoolExecutor.awaitTermination(ThreadPoolExecutor.java:1454)
at projects.gemoma.GeMoMaPipeline$1.run(GeMoMaPipeline.java:609)
at projects.gemoma.GeMoMaPipeline$FlaggedRunnable.run(GeMoMaPipeline.java:1409)
at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:829)
11 jobs did not finish as expected. Please check the output carefully.
Did not delete temporary files allowing to debug.
java.lang.InterruptedException
at java.base/java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.reportInterruptAfterWait(AbstractQueuedSynchronizer.java:2056)
at java.base/java.util.concurrent.locks.AbstractQueuedSynchronizer$ConditionObject.awaitNanos(AbstractQueuedSynchronizer.java:2133)
at java.base/java.util.concurrent.ThreadPoolExecutor.awaitTermination(ThreadPoolExecutor.java:1454)
at projects.gemoma.GeMoMaPipeline$1.run(GeMoMaPipeline.java:609)
at projects.gemoma.GeMoMaPipeline$FlaggedRunnable.run(GeMoMaPipeline.java:1409)
at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
at java.base/java.util.concurrent.Executors$RunnableAdapter.call(Executors.java:515)
at java.base/java.util.concurrent.FutureTask.run(FutureTask.java:264)
at java.base/java.util.concurrent.ThreadPoolExecutor.runWorker(ThreadPoolExecutor.java:1128)
at java.base/java.util.concurrent.ThreadPoolExecutor$Worker.run(ThreadPoolExecutor.java:628)
at java.base/java.lang.Thread.run(Thread.java:829)
Exception in thread "main" java.lang.RuntimeException: Did not finish as intended.
at projects.gemoma.GeMoMaPipeline.run(GeMoMaPipeline.java:1234)
at projects.gemoma.GeMoMaModule.run(GeMoMaModule.java:94)
at de.jstacs.tools.ui.cli.CLI.run(CLI.java:426)
at projects.gemoma.GeMoMa.main(GeMoMa.java:399)メモリ不足(?)でMMseq2が止まったらしい。
メモリを明示してやってみる。
#$ -S /bin/bash
#$ -cwd
#PBS -l select=1:ncpus=10:mem=50gb
echo start at
date
# 環境の読み込み
source /home/dendezia/tool/pyenv_env/gemoma_profile
# GeMoMaPipelineの実行
GeMoMa \
-Xmx50g \
GeMoMaPipeline \
t=/home/dendezia/tool/for_gemoma/nama_data/241128_madara_masked.fasta \
r=NO \
o=true \
s=own \
i=Tcas \
a=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
g=/home/dendezia/tool/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
s=own \
i=Hsap \
a=/home/dendezia/tool/for_gemoma/nama_data/reference/Homo_sapiens/ncbi_dataset/data/GCF_000001405.40/genomic.gff \
g=/home/dendezia/tool/for_gemoma/nama_data/reference/Homo_sapiens/ncbi_dataset/data/GCF_000001405.40/GCF_000001405.40_GRCh38.p14_genomic.fna \
GeMoMa.Score=ReAlign \
AnnotationFinalizer.r=NO \
threads=10 \
outdir=/home/dendezia/tool/for_gemoma/241217/241217_2sp_out
echo end at
dateこれでもダメだったので、~/tool/for_gemoma/241218で10c200g.shと20c500g.shを作成し、実行した。
10c200g.shの方はダメだった。ちなみにヒトゲノムのデータセットが悪い説はないか?
1220
オジロのソフトマスクやり直し
DfamのデータベースとRepeatModelerで作成したデータベースを結合させる。
kosukesano@at137:~/tools/for_RepeatMasker_Docker/nama_data$ cat Dfam-RepeatMasker.lib ~/tools/for_softmask/Ojiro_softmask/RM_3181478.TueOct151949192024/consensi.fa.classified > 241220_for_ojiro.lib
kosukesano@at137:~/tools/for_RepeatMasker_Docker/nama_data$ ls
231117_madaragenome.fasta 241128_for_madara.lib 241220_for_ojiro.lib Dfam-RepeatMasker.lib
kosukesano@at137:~/tools/for_RepeatMasker_Docker/nama_data$~/tools/for_RepeatMasker_Docker/241220_ojiroを作成し、その下でojiro_softmask.shを実行する。
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date
apptainer exec /home/kosukesano/tools/for_RepeatMasker_Docker/dfam-tetools_1.sif\
RepeatMasker\
-pa 6\
-s\
-lib /home/kosukesano/tools/for_RepeatMasker_Docker/nama_data/241220_for_ojiro.lib\
-dir /home/kosukesano/tools/for_RepeatMasker_Docker/241220_ojiro/output_dir\
-xsmall\
-gff\
/home/kosukesano/tools/for_softmask/nama_data/Release_241005-ojiro_hifiasm/out.p_ctg.fa
echo end at
date1226
オジロのソフトマスク結果
>ptg000001l
TAGCAGTATCGAGTATAATCATAATATCGTAGTTTTATTGCTAAAACTGT
CCTTTCAACTAATAGTTAGGTATAGATATTCACATATGCATTTTCATTTT
TAAATAAATCTTCGATACTCTGTAATCAATTTCCATTTTTGTTCTATCCC
AAATTATATAAAGTATATAATTTTCTATGTTTTTTTGGTGGAGTGTTCGC
AAAGGGCTGTGACTTGAAGGATGCGTCTTAATCTCGAGGAATATAATGAA
GCAAATGTATCTGCATTAATCTTCTTCTATCTAGTGAGTTGAAATATAAT
GTGGGGTATTATAACAATGACGCAGTAGTAAATAAAAATAAATCAAATCG
ACTTACGTCGATATAAAGTATACTAATTAAAAACATAAAGTCAATCTCGC
AAAAGCAAATATAAGTTAATACATATTAGATATAAATTTGTCCAGATATA
TTAAAATGGCTATTAGTCATTTCTTGACACGGGAtaattaataattaatt
tttcattaaattaaCATACTAAGAAAAACCAGACATCAGACCCAGTTGGT
TTTTCAACTGAAGTGAAACAGTAATCTTAAGCAAATATATCAATAATCTA
ATATGAATTCCTACAAAATTATCTGCTTGAACCTAGAACAAGCTATGCCT
GCGTATATAACTTTAACCAGTTAAGTGACTTCATGCATATATTACTATGA
TTTTAACACCTAATTAGCCTAATGGCTTCTGCTTATGTTCAAAAGATTAC
ATCTAAGTCGATTTTCTTCTCATCGTCATAAGAGGATTAAAATATTCAAA
TTAATAATATCCAGAATGATCAATAAATTAACAAACGAAATTTTAAATTG
CCGTTGATCTAATgtggtaaatgggtattatgtaatatttttcgacaggg
gtggtatgatcgagtaattcgTCAACTAGAAACTACAGTATATATTGTAT
CTGAGCTGAACGAAGTTacagggatatccatataaaagtaatgaatccta
ctttttattcttaaataaacgttatatataaaagttttggctattttgaa
acatttatatatcttacaaccaaaataattgtgcaacgaataatgaagta
gtataaaacatcgattttcttgcttacttaaattggacggtatggtttttできてそう
これをBRAKER3用の生データディレクトリにコピー。
kosukesano@at138:~/tools/for_braker/nama_data$ cp ~/tools/for_RepeatMasker_Docker/241220_ojiro/output_dir/out.p_ctg.fa.masked 241226_ojiro_masked.fa
kosukesano@at138:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta Dfro.fna Pst_NotUseEDTA_upper10000.fna femo_busco.sh.po26221930
241017_Ojiro_masked.fa Dval.fna Pst_NotUseEDTA_upper5000.fna kohuki_busco.sh
241120_madara_dfam.fasta Ekam_NotUseEDTA.fna Sfem_RNAseq kohuki_busco.sh.e26238968
241127_madara_DockerRM.fasta Ekam_oomoji.fna Sfem_pilon_softmasked.fasta kohuki_busco.sh.o26238968
241127_madara_dfam_RM_data_NotUsedBuildDB.fasta Elaeidobius_kamerunicus.masked.fna Sfem_softmasked.fasta kohuki_busco.sh.pe26238968
241127_madara_dfam_RMdata_buildDB.fasta GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz busco_downloads kohuki_busco.sh.po26238968
241129_madara_dfamplusbuilddb.fasta Madara_RNAseq femo_busco.sh kohuki_softmasked.fasta
241226_ojiro_masked.fa Ojiro_RNAseq femo_busco.sh.e26221930 kohuki_softmasked_upper1000.fasta
BUSCO_OUTPUT_FEMO_GENOME Pst_NotUseEDTA.fna femo_busco.sh.o26221930 length.txt
BUSCO_OUTPUT_KOHUKI_GENOME Pst_NotUseEDTA_upper1000.fna femo_busco.sh.pe26221930 madaralength.txt
kosukesano@at138:~/tools/for_braker/nama_data$ オジロのBRAKER3再実行
~/tools/for_braker/241226_ojiroを作成し、そこで以下のスクリプトを作成。
### ojiro_braker.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 16
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/241226_ojiro_masked.fa\
--prot_seq=/home/kosukesano/tools/Arthropoda.fa\
--rnaseq_sets_ids=ojiro-female_1,ojiro-female_2,ojiro-male_1,ojiro-male_2,ojiro-larva_1,ojiro-larva_2,\
ojiro_E1_1,ojiro_E1_2,ojiro_E2_1,ojiro_E2_2,ojiro_E3_1,ojiro_E3_2,ojiro_E4_1,ojiro_E4_2,\
ojiro_H1_1,ojiro_H1_2,ojiro_H2_1,ojiro_H2_2,ojiro_H3_1,ojiro_H3_2,ojiro_H4_1,ojiro_H4_2,\
ojiro_L1_1,ojiro_L1_2,ojiro_L2_1,ojiro_L2_2,ojiro_L3_1,ojiro_L3_2,ojiro_L4_1,ojiro_L4_2,\
ojiro_O1_1,ojiro_O1_2,ojiro_O2_1,ojiro_O2_2,ojiro_O3_1,ojiro_O3_2,ojiro_O4_1,ojiro_O4_2,\
ojiro_T1_1,ojiro_T1_2,ojiro_T2_1,ojiro_T2_2,ojiro_T3_1,ojiro_T3_2,ojiro_T4_1,ojiro_T4_2\
--rnaseq_sets_dir=/home/kosukesano/tools/for_braker/nama_data/Ojiro_RNAseq\
--threads=16\
--species=Ojiro_241226\
--AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
--AUGUSTUS_BIN_PATH=/usr/bin\
--AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
--TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
echo end at
date遺伝研でのGeMoMa実行
~/tools/for_gemoma/241226ディレクトリを作成、以下でTcasのみをレファレンスとしたgemoma_tcas.shを作成した。
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 10
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
# 環境の読み込み
source /home/kosukesano/tools/pyenv_env/gemoma_profile
# GeMoMaPipelineの実行
GeMoMa \
-Xmx100g \
GeMoMaPipeline \
t=/home/kosukesano//tools/for_braker/nama_data/231117_Madara_softmasked.fasta \
r=NO \
o=true \
i=Tcas \
a=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
g=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
GeMoMa.Score=ReAlign \
AnnotationFinalizer.r=NO \
threads=10 \
outdir=/home/kosukesano/tools/for_gemoma/241226/tcas_out
echo end at
dateTcasとDmelをレファレンスにしたスクリプトgemoma_dmel_tcas.shも併せて実行した。
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 10
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
# 環境の読み込み
source /home/kosukesano/tools/pyenv_env/gemoma_profile
# GeMoMaPipelineの実行
GeMoMa \
-Xmx100g \
GeMoMaPipeline \
t=/home/kosukesano//tools/for_braker/nama_data/231117_Madara_softmasked.fasta \
r=NO \
o=true \
s=own \
i=Tcas \
a=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
g=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
s=own \
i=Dmel \
a=/home/kosukesano/tools/for_gemoma/nama_data/reference/Dmel/genomic.gff \
g=/home/kosukesano/tools/for_gemoma/nama_data/reference/Dmel/GCF_000001215.4_Release_6_plus_ISO1_MT_genomic.fna \
GeMoMa.Score=ReAlign \
AnnotationFinalizer.r=NO \
threads=10 \
outdir=/home/kosukesano/tools/for_gemoma/241226/dmel_tcas_out
echo end at
date2025年1月
0107
GeMoMa続き
遺伝研のジョブがqwのまま進まない。
とりあえずintelでも入れてみる。
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 10
#$ -l s_vmem=64G
#$ -l mem_req=64G
echo start at
date
# 環境の読み込み
source /home/kosukesano/tools/pyenv_env/gemoma_profile
# GeMoMaPipelineの実行
GeMoMa \
-Xmx64g \
GeMoMaPipeline \
t=/home/kosukesano//tools/for_braker/nama_data/231117_Madara_softmasked.fasta \
r=NO \
o=true \
i=Tcas \
a=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas_genomic.gff \
g=/home/kosukesano/tools/for_gemoma/nama_data/reference/Tcas/Tcas.fna \
GeMoMa.Score=ReAlign \
AnnotationFinalizer.r=NO \
threads=10 \
outdir=/home/kosukesano/tools/for_gemoma/250107/tcas_out
echo end at
dateSSHの設定変更
(base) :~/Desktop/notebook$ ssh-keygen -R gw.ddbj.nig.ac.jp
# Host gw.ddbj.nig.ac.jp found: line 1
# Host gw.ddbj.nig.ac.jp found: line 5
/Users/kosukesano/.ssh/known_hosts updated.
Original contents retained as /Users/kosukesano/.ssh/known_hosts.old
(base) :~/Desktop/notebook$ 0110
GINGERのインストール
Dockerイメージを使って入れた。Docker pullを使うとエラー吐かれるので注意
### 失敗例
kosukesano@at139:~/tools/for_ginger$ docker pull i10labtitech/tools:GINGER_v1.0.1
Cannot connect to the Docker daemon at unix:///var/run/docker.sock. Is the docker daemon running?
kosukesano@at139:~/tools/for_ginger$Dockerのdaemonが悪さをするらしい。
### 多分うまくいったやつ?
kosukesano@at139:~/tools/for_ginger$ apptainer pull docker://i10labtitech/tools:GINGER_v1.0.1
INFO: Converting OCI blobs to SIF format
INFO: Starting build...
Getting image source signatures
Copying blob 482bd95e477d done |
Copying blob 1bc677758ad7 done |
Copying config 6590a7c3cd done |
Writing manifest to image destination
2025/01/10 17:22:58 info unpack layer: sha256:1bc677758ad7fa4503417ae5be18809c5a8679b5b36fcd1464d5a8e41cb13305
2025/01/10 17:22:59 info unpack layer: sha256:482bd95e477d05637df1423052f7034a88d402126f6ec0a2ae7a6165e9891dab
2025/01/10 17:25:01 warn rootless{usr/local/src/trinityrnaseq-v2.15.0/trinity-plugins/Trimmomatic-0.36/trimmomatic.jar} ignoring (usually) harmless EPERM on setxattr "user.rootlesscontainers"
2025/01/10 17:25:02 warn rootless{usr/local/src/trinityrnaseq-v2.15.1/trinity-plugins/Trimmomatic-0.36/trimmomatic.jar} ignoring (usually) harmless EPERM on setxattr "user.rootlesscontainers"
2025/01/10 17:25:02 warn rootless{usr/local/src/trinityrnaseq-v2.15.1/trinity-plugins/bamsifter/htslib/build/lib/libhts.so} ignoring (usually) harmless EPERM on setxattr "user.rootlesscontainers"
2025/01/10 17:25:02 warn rootless{usr/local/src/trinityrnaseq-v2.15.1/trinity-plugins/bamsifter/htslib/build/lib/libhts.so.3} ignoring (usually) harmless EPERM on setxattr "user.rootlesscontainers"
2025/01/10 17:25:02 warn rootless{usr/local/src/trinityrnaseq-v2.15.1/trinity-plugins/bamsifter/htslib/htscodecs.mk} ignoring (usually) harmless EPERM on setxattr "user.rootlesscontainers"
2025/01/10 17:25:02 warn rootless{usr/local/src/trinityrnaseq-v2.15.1/trinity-plugins/bamsifter/htslib/libhts.so.3} ignoring (usually) harmless EPERM on setxattr "user.rootlesscontainers"
INFO: Creating SIF file...
kosukesano@at139:~/tools/for_ginger$0121
コフキゾウムシのソフトマスク
コフキゾウムシゲノムの生データがこれ
kosukesano@at139:~/tools/for_softmask/kohuki_softmask$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a 180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta
file format type num_seqs sum_len min_len avg_len max_len Q1 Q2 Q3 sum_gap N50 Q20(%) Q30(%) GC(%)
180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta FASTA DNA 2,372,896 3,664,337,660 48 1,544.2 151,585 86 100 363 0 15,058 0 0 32.29
kosukesano@at139:~/tools/for_softmask/kohuki_softmask$ ゲノム全長が3Gbpで、短い配列がめちゃくちゃ多い。
1000bp未満を切り落とす
kosukesano@at139:~/tools/for_softmask/kohuki_softmask$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit seq -m 1000 180927_4-kofukizoumushi_397M_supernova_v210_min500_raw.fasta > 250121_kohuki_upper1000.fasta
[WARN] you may switch on flag -g/--remove-gaps to remove spaces
kosukesano@at139:~/tools/for_softmask/kohuki_softmask$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a 250121_kohuki_upper1000.fasta
file format type num_seqs sum_len min_len avg_len max_len Q1 Q2 Q3 sum_gap N50 Q20(%) Q30(%) GC(%)
250121_kohuki_upper1000.fasta FASTA DNA 397,892 3,349,012,532 1,000 8,416.9 151,585 1,967 4,187 10,292 0 17,050 0 0 32.27
kosukesano@at139:~/tools/for_softmask/kohuki_softmask$めちゃくちゃコンティグが減った。これを使ってソフトマスクを行う。
~/tools/for_softmask/250121_Kohuki_softmaskディレクトリを作成、そこでまずBLASTデータベースを作る。
(EDTA2) kosukesano@at138:~/tools/for_softmask/250121_Kohuki_softmask$ BuildDatabase -name Kohuki_denovo_DB ../kohuki_softmask/250121_kohuki_upper1000.fasta
Building database Kohuki_denovo_DB:
Reading ../kohuki_softmask/250121_kohuki_upper1000.fasta...
Number of sequences (bp) added to database: 397892 ( 3349012532 bp )
(EDTA2) kosukesano@at138:~/tools/for_softmask/250121_Kohuki_softmask$ ls
Kohuki_RepeatModeler.sh Kohuki_denovo_DB.nin Kohuki_denovo_DB.nnd Kohuki_denovo_DB.nog Kohuki_denovo_DB.translation
Kohuki_denovo_DB.nhr Kohuki_denovo_DB.njs Kohuki_denovo_DB.nni Kohuki_denovo_DB.nsq
(EDTA2) kosukesano@at138:~/tools/for_softmask/250121_Kohuki_softmask$ 続いてこれを元にRepeatModelerを実行する
### Kohuki_RepeatModeler.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
RepeatModeler -database Kohuki_denovo_DB -pa 6
echo end at
dateこれをqsubで投げた。
フェモラータのソフトマスク
フェモラータゲノムの生データがこれ
(EDTA2) kosukesano@at138:~/tools/for_softmask/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a Sfem_assembly.fasta
file format type num_seqs sum_len min_len avg_len max_len Q1 Q2 Q3 sum_gap N50 Q20(%) Q30(%) GC(%)
Sfem_assembly.fasta FASTA DNA 5,084 495,627,753 26 97,487.8 7,760,786 1,769.5 3,258 6,975 0 1,228,127 0 0 36.71
(EDTA2) kosukesano@at138:~/tools/for_softmask/nama_data$ 同じく1000bp未満を切る
(EDTA2) kosukesano@at138:~/tools/for_softmask/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit seq -m 1000 Sfem_assembly.fasta > 250121_Sfem_upper1000.fasta
[WARN] you may switch on flag -g/--remove-gaps to remove spaces
(EDTA2) kosukesano@at138:~/tools/for_softmask/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat -a 250121_Sfem_upper1000.fasta
file format type num_seqs sum_len min_len avg_len max_len Q1 Q2 Q3 sum_gap N50 Q20(%) Q30(%) GC(%)
250121_Sfem_upper1000.fasta FASTA DNA 4,530 495,258,564 1,001 109,328.6 7,760,786 2,211 3,625 12,603 0 1,228,127 0 0 36.71
(EDTA2) kosukesano@at138:~/tools/for_softmask/nama_data$そんなに変わらなかった。
~/tools/for_softmask/250121_Sfem_softmaskディレクトリを作成、そこでBLASTデータベースを作る。
(EDTA2) kosukesano@at138:~/tools/for_softmask/250121_Sfem_softmask$ BuildDatabase -name Sfem_denovo_DB ../nama_data/250121_Sfem_upper1000.fasta
Building database Sfem_denovo_DB:
Reading ../nama_data/250121_Sfem_upper1000.fasta...
Number of sequences (bp) added to database: 4530 ( 495258564 bp )
(EDTA2) kosukesano@at138:~/tools/for_softmask/250121_Sfem_softmask$ ls
Sfem_denovo_DB.nhr Sfem_denovo_DB.nin Sfem_denovo_DB.njs Sfem_denovo_DB.nnd Sfem_denovo_DB.nni Sfem_denovo_DB.nog Sfem_denovo_DB.nsq Sfem_denovo_DB.translation
(EDTA2) kosukesano@at138:~/tools/for_softmask/250121_Sfem_softmask$ 続いてこれを元にRepeatModelerを実行する
### Sfem_RepeatModeler.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l intel
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date
source ~/tools/pyenv_env/EDTA_profile
;;pppppl/////
RepeatModeler -database Sfem_denovo_DB -pa 6
echo end at
dateこれをqsubで投げた。
0122
2種のソフトマスク
なんか全然入らなかったので、short設定にしてメモリ設定を全部3で割った値に直した。
0123
2種のソフトマスク
round-1で止まっちゃったので、gpuにしてメモリを戻してもう一回かけた。
GINGERのインストール(GitHubから直接落とす)
git cloneを使用してGitHubページから必要なファイル・ディレクトリを全て落とす。
kosukesano@at137:~/tools/for_ginger/250123_test$ git clone https://github.com/i10labtitech/GINGER.git
Cloning into 'GINGER'...
remote: Enumerating objects: 428, done.
remote: Counting objects: 100% (428/428), done.
remote: Compressing objects: 100% (237/237), done.
remote: Total 428 (delta 247), reused 366 (delta 185), pack-reused 0 (from 0)
Receiving objects: 100% (428/428), 1.10 MiB | 1.28 MiB/s, done.
Resolving deltas: 100% (247/247), done.
kosukesano@at137:~/tools/for_ginger/250123_test$こんなディレクトリができる。
kosukesano@at137:~/tools/for_ginger/250123_test$ ls
GINGER
kosukesano@at137:~/tools/for_ginger/250123_test$ ls GINGER/
AUTHORS CHANGES ChangeLog FAQ INSTALL LICENSE Makefile README VERSION generateSampleData_cel.pl nextflow.config.user pipeline runEvaluatePred.pl runGINGER.pl src util
kosukesano@at137:~/tools/for_ginger/250123_test$ GINGERディレクトリに入ってmakeコマンドを実行
kosukesano@at137:~/tools/for_ginger/250123_test/GINGER$ make
cd src/mapping && make all
make[1]: Entering directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/mapping'
g++ gff_trimmer.cpp -o gff_trimmer -std=c++0x -O3
g++ exon_num_filter.cpp -o exon_num_filter -std=c++0x -O3
g++ longest_transcript.cpp -o longest_transcript -std=c++0x -O3
g++ repeat_checker.cpp -o repeat_checker -std=c++0x -O3
g++ strand_replace.cpp -o strand_replace -std=c++0x -O3
g++ set_difference.cpp -o set_difference -std=c++0x -O3
g++ tag_trimmer.cpp -o tag_trimmer -std=c++0x -O3
g++ ORF_finder.cpp -o ORF_finder -std=c++0x -O3
install -s \
gff_trimmer exon_num_filter longest_transcript \
repeat_checker strand_replace set_difference \
tag_trimmer ORF_finder ../../util/mapping; \
install -s ORF_finder ../../util/denovo
install: target '../../util/mapping' is not a directory
make[1]: Leaving directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/mapping'
cd src/denovo && make all
make[1]: Entering directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/denovo'
make[1]: Nothing to be done for 'all'.
make[1]: Leaving directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/denovo'
cd src/homology && make all
make[1]: Entering directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/homology'
g++ fastarepair.cpp -o fastarepair -std=c++0x -O3
g++ fastarepair2.cpp -o fastarepair2 -std=c++0x -O3
g++ gff_2_proteinfasta.cpp -o gff_2_proteinfasta -std=c++0x -O3
g++ flameshiftgrep.cpp -o flameshiftfilter -std=c++0x -O3
install -s fastarepair fastarepair2 gff_2_proteinfasta flameshiftfilter ../../util/homology
make[1]: Leaving directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/homology'
cd src/abinitio && make all
make[1]: Entering directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/abinitio'
g++ simple_low_norepeatmask.cpp -o simple_low_norepeatmask -std=c++0x -O3
g++ inframe_stopcodon_exclude.cpp -o inframe_stopcodon_exclude -std=c++0x -O3
g++ makefasta.cpp -o makefasta -std=c++0x -O3
install -s simple_low_norepeatmask inframe_stopcodon_exclude makefasta ../../util/abinitio
make[1]: Leaving directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/abinitio'
cd src/merge_phase0 && make all
make[1]: Entering directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/merge_phase0'
g++ 190521_gff_editor.cpp -o gff_editor -std=c++0x -O3
In file included from 190521_gff_editor.cpp:33:
function.hpp: In function ‘int filter_fuc(std::string, std::string, std::unordered_map<std::__cxx11::basic_string<char>, std::__cxx11::basic_string<char> >&, int&, int&)’:
function.hpp:272:1: warning: control reaches end of non-void function [-Wreturn-type]
272 | }
| ^
g++ row2_rename.cpp -o Row2_rename -std=c++0x -O3
g++ rnaseq_reform.cpp -o RNA-seq_reform -std=c++0x -O3
g++ spaln_reform.cpp -o Spaln_reform -std=c++0x -O3
g++ augustus_reform.cpp -o Augustus_reform -std=c++0x -O3
install -s \
gff_editor Row2_rename RNA-seq_reform Spaln_reform \
Augustus_reform ../../util/merge_phase0
make[1]: Leaving directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/merge_phase0'
cd src/merge_phase1 && make all
make[1]: Entering directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/merge_phase1'
g++ grouping.cpp -o Grouping -std=c++0x -O3
g++ subgroup_v2.2.cpp -o subgroup -std=c++0x -O3
g++ new_subgroup.cpp -o new_subgroup -std=c++0x -O3
g++ searchalgo.cpp -o Searchalgo -std=c++0x -O3
g++ gff_editor.cpp -o gff_editor -std=c++0x -O3
g++ initial_exon_polish.cpp -o initial_exon_polish -std=c++0x -O3
install -s \
Grouping subgroup new_subgroup Searchalgo \
gff_editor initial_exon_polish ../../util/merge_phase1
make[1]: Leaving directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/merge_phase1'
cd src/merge_phase2 && make all
make[1]: Entering directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/merge_phase2'
g++ geneadd_v191115.cpp -o geneadd_v191115 -std=c++0x -O3
g++ geneadd_v191119.cpp -o geneadd_v191119 -std=c++0x -O3
g++ grouping_v1.cpp -o grouping_v1 -std=c++0x -O3
install -s geneadd_v191115 geneadd_v191119 grouping_v1 ../../util/merge_phase2
make[1]: Leaving directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/merge_phase2'
cd src/summary && make all
make[1]: Entering directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/summary'
g++ final_reform.cpp -o final_reform -std=c++0x -O3
install -s final_reform ../../util/summary
make[1]: Leaving directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/summary'
cd src/evaluation && make all
make[1]: Entering directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/evaluation'
g++ evaluation4.cpp -o evaluate -std=c++0x -O3
g++ preevaluation.cpp -o preevaluate -std=c++0x -O3
install -s evaluate preevaluate ../../util/evaluation
install: target '../../util/evaluation' is not a directory
make[1]: *** [Makefile:12: install] Error 1
make[1]: Leaving directory '/lustre7/home/kosukesano/tools/for_ginger/250123_test/GINGER/src/evaluation'
make: *** [Makefile:31: evaluation] Error 2
kosukesano@at137:~/tools/for_ginger/250123_test/GINGER$なんか知らないエラーが出てるけど気にしない。
この状態でもう既に素のGINGERは動く。
kosukesano@at137:~/tools/for_ginger/250123_test/GINGER$ ./runGINGER.pl --help
Usage: ./runGINGER.pl [Netflow configuration file]
--mapping Preparation phase Mapping-based method only
--denovo Preparation phase de novo-based method only
--homology Preparation phase homology-based method only
--abinitio Preparation phase ab initio-based method only
--phase0 Merge phase 0 only
--phase1 Merge phase 1 only
--phase1manual Merge phase 1 only, need the threshold of gene
--phase2 Merge phase 2 only
--totalcds Total CDS minimum length in Merge phase 2 (i.e. threshold)
--summary Merge phase summary only
--help This help message
./runGINGER.pl [configuration file for user specific settings] at ./runGINGER.pl line 83.
kosukesano@at137:~/tools/for_ginger/250123_test/GINGER$ GINGERの実行(できなかった)
kosukesano@at137:~/tools/for_ginger/250123_test/nama_data$ ls
231117_Madara_softmasked.fasta 231117_madaragenome.fasta 231117_madaragenome.fasta.out adult-1_1.fastq adult-1_2.fastq
kosukesano@at137:~/tools/for_ginger/250123_test/nama_data$ ここに生データを置いた。
~/tools/for_ginger/250123_testにnextflow.configをコピーし、編集して実行しようとしたが……。
/****************************************
Homology based
****************************************/
PDIR_PREP_HOMOLOGY = "${PDIR_PREP}/homology" // *** No need to edit ***
PDIR_PREP_HOMOLOGY_HOMOLOGY = "${PDIR_PREP_HOMOLOGY}/homology" // *** No need to edit ***
PDIR_PREP_HOMOLOGY_HOMOLOGYMERGE = "${PDIR_PREP_HOMOLOGY}/homology_merge" // *** No need to edit ***
PDIR_PREP_HOMOLOGY_HOMOLOGYFILTER = "${PDIR_PREP_HOMOLOGY}/homology_filter" // *** No need to edit ***
UTILPATH_HOMOLOGY = "${GINGER_UTIL}/homology" // *** No need to edit ***
// --- Tools for homology ---
SPALN = "/path/to/spaln" // a full path to Spaln command "spaln"
MAKEIDX = "/path/to/makeidx.pl" // a full path to Spaln command "makeidx.pl"
MAKBLK = "/path/to/makblk.pl" // a full path to Spaln command "makblk.pl"
/****************************************
Ab initio based
****************************************/
PDIR_PREP_ABINITIO = "${PDIR_PREP}/abinitio" // *** No need to edit ***
PDIR_PREP_ABINITIO_AUGUSTUS = "${PDIR_PREP_ABINITIO}/augustus" // *** No need to edit ***
PDIR_PREP_ABINITIO_SNAP = "${PDIR_PREP_ABINITIO}/snap" // *** No need to edit ***
UTILPATH_ABINITIO = "${GINGER_UTIL}/abinitio" // *** No need to edit ***
// --- Tools and options related to Augustus ---
AUGUSTUS_DIR = "/path/to/augustusSourceTree" // a full path to a directory that Augustus source tree exists
AUGUSTUS = "${AUGUSTUS_DIR}/bin/augustus" // *** No need to edit ***
ETRAINING = "${AUGUSTUS_DIR}/bin/etraining" // *** No need to edit ***
AUGUSTUS_SCRIPT_DIR = "${AUGUSTUS_DIR}/scripts" // *** No need to edit ***
AUGUSTUS_CONFIG_DIR = "${AUGUSTUS_DIR}/config" // *** No need to edit ***
AUGUSTUS_SPEC_DIR = "${AUGUSTUS_DIR}/config/species" // *** No need to edit ***
AUGUSTUS_WORK_DIR = "${PDIR}/augustus_config" // *** No need to edit ***
AUGUSTUS_SPEC = "ginger" // a directory name that stores a new trained model
// the name must be unique within "[Augutus root]/config/species/"
AUGUSTUS_TRAINING_DATA = "${PDIR_PREP_MAPPING_TOLEARN2ND}/${OPREFIX}_learn_2nd.gff3" // *** No need to edit ***
// AUGUSTUS_TRAINING_DATA is used if you run mapping.nf and abinitio.nf separately
AUGUSTUS_TRAINING_SIZE = 1000 // Number of gene structures for training
// --- Tools and options related to SNAP ---
SNAP_DIR = "/path/to/snapBinDir" // a full path to a directory that SNAP binary exists
FATHOM = "${SNAP_DIR}/fathom" // *** No need to edit ***
FORGE = "${SNAP_DIR}/forge" // *** No need to edit ***
SNAP = "${SNAP_DIR}/snap" // *** No need to edit ***依存パッケージ全部にパス通すの!?キッツイ!
「DfamのRepeatMasker用データとマダラのゲノムデータをEDTAのBuildDataBaseでデータベース化したもの結合させ、-libに指定し、DockerのRepeatMaskerでマスキングしたマダラケシツブゾウムシのデータ」を使って再度OrthoFinderをかける
1202に最新バージョンのマダラゲノムからアイソフォームを抜いてたようなので、これを使ってOrthoFinderを行う。
kosukesano@at137:~/tools/for_braker/241129_madara$ ls
241129_madara_iso1.aa busco_1153799319.log busco_4088551519.log madara_braker.sh.e27304363 madara_busco.sh.e27312420 madara_busco.sh.o27312452 madara_busco.sh.po27312451
BUSCO_OUTPUT_MADARA busco_1949637205.log busco_984679413.log madara_braker.sh.o27304363 madara_busco.sh.e27312451 madara_busco.sh.pe27312420 madara_busco.sh.po27312452
BUSCO_OUTPUT_MADARA2 busco_2089185273.log busco_downloads madara_braker.sh.pe27304363 madara_busco.sh.e27312452 madara_busco.sh.pe27312451
ExIsoform.py busco_269980639.log iso1_busco madara_braker.sh.po27304363 madara_busco.sh.o27312420 madara_busco.sh.pe27312452
braker busco_3215571167.log madara_braker.sh madara_busco.sh madara_busco.sh.o27312451 madara_busco.sh.po27312420
kosukesano@at137:~/tools/for_braker/241129_madara$241129_madara_iso1.aaがアイソフォーム抜いたやつ
続いて、OrthoFinder用のディレクトリを作成し、マダラのゲノムをコピーする
kosukesano@at137:~/tools/for_orthofinder$ mkdir 250123_6sp_iso1
kosukesano@at137:~/tools/for_orthofinder$ cd 250123_6sp_iso1/
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ cp ~/tools/for_braker/241129_madara/241129_madara_iso1.aa Smad_iso1.faa
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ ls
Smad_iso1.faa
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ 続いて、マダラゲノムデータのヘッダー行を書き換える。edit.pyを使用
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ python edit.py
../250123_6sp_iso1/Smad_iso1.faa に保存しました。
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ ls
Smad_iso1.faa edit.py
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ また、昔作ったアイソフォーム1つのファイルが241115_6sp_isoにあるので、全部コピーする。
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ cp ../241115_6sp_iso/Cass_iso1.faa ../250123_6sp_iso1/
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ cp ../241115_6sp_iso/Sory_iso1.faa ../250123_6sp_iso1/
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ cp ../241115_6sp_iso/Dpon_iso1.faa ../250123_6sp_iso1/
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ cp ../241115_6sp_iso/Agra_iso1.faa ../250123_6sp_iso1/
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ cp ../241115_6sp_iso/Tcas_iso1.faa ../250123_6sp_iso1/
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$ ls
Agra_iso1.faa Cass_iso1.faa Dpon_iso1.faa Smad_iso1.faa Sory_iso1.faa Tcas_iso1.faa edit.py
kosukesano@at137:~/tools/for_orthofinder/250123_6sp_iso1$これらを使ってOrthoFinderを実行。以下のスクリプトをqsubで投げた。
### orthofinder_250123.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 16
#$ -l gpu
echo start at
date
singularity exec /usr/local/biotools/o/orthofinder:2.5.4--hdfd78af_0 orthofinder\
-f /home/kosukesano/tools/for_orthofinder/250123_6sp_iso1\
-t 16
echo end at
date今後はこれを使ってCAFEなりPAMLなりを行なっていく。
0124
OrthoFinder結果
kosukesano@at138:~$ cd tools/for_orthofinder/250123_6sp_iso1/OrthoFinder/Results_Jan23_1/
kosukesano@at138:~/tools/for_orthofinder/250123_6sp_iso1/OrthoFinder/Results_Jan23_1$ ls
Citation.txt Gene_Trees Orthogroups Phylogenetically_Misplaced_Genes Single_Copy_Orthologue_Sequences
Comparative_Genomics_Statistics Log.txt Orthologues Putative_Xenologs Species_Tree
Gene_Duplication_Events Orthogroup_Sequences Phylogenetic_Hierarchical_Orthogroups Resolved_Gene_Trees WorkingDirectory
kosukesano@at138:~/tools/for_orthofinder/250123_6sp_iso1/OrthoFinder/Results_Jan23_1$OK、できてる
フェモラータのソフトマスク続き
RepeatModelerが終わった。
kosukesano@at138:~/tools/for_softmask/250121_Sfem_softmask$ ls RM_1746481.ThuJan231122422025/
consensi.fa consensi.fa.classified families-classified.stk families.stk round-1 round-2 round-3 round-4 round-5 round-6 tmpConsensi.fa
kosukesano@at138:~/tools/for_softmask/250121_Sfem_softmask$最終出力のconsensi.fa.classifiedもできてる!
これをDfamのRepeatMasker用データセットとマージする。
kosukesano@at138:~/tools/for_softmask/250121_Sfem_softmask$ cat ../nama_data/Dfam_RepeatMasker_lib.fasta RM_1746481.ThuJan231122422025/consensi.fa.classified > Sfem_merged.fasta
kosukesano@at138:~/tools/for_softmask/250121_Sfem_softmask$ ls
RM_1746481.ThuJan231122422025 Sfem_RepeatModeler.sh.e27434697 Sfem_RepeatModeler.sh.pe27434697 Sfem_denovo_DB-families.stk Sfem_denovo_DB.nnd Sfem_denovo_DB.translation
RM_2307827.WedJan221537242025 Sfem_RepeatModeler.sh.o27433478 Sfem_RepeatModeler.sh.po27433478 Sfem_denovo_DB.nhr Sfem_denovo_DB.nni Sfem_merged.fasta
Sfem_RepeatModeler.sh Sfem_RepeatModeler.sh.o27434697 Sfem_RepeatModeler.sh.po27434697 Sfem_denovo_DB.nin Sfem_denovo_DB.nog
Sfem_RepeatModeler.sh.e27433478 Sfem_RepeatModeler.sh.pe27433478 Sfem_denovo_DB-families.fa Sfem_denovo_DB.njs Sfem_denovo_DB.nsq
kosukesano@at138:~/tools/for_softmask/250121_Sfem_softmask$ これを使ってRepeatMaskerをかける。
### Sfem_RepeatMasker.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date
apptainer exec /home/kosukesano/tools/for_RepeatMasker_Docker/dfam-tetools_1.sif\
RepeatMasker\
-pa 6\
-s\
-lib /home/kosukesano/tools/for_softmask/250121_Sfem_softmask/Sfem_merged.fasta\
-dir /home/kosukesano/tools/for_softmask/250121_Sfem_softmask/output_dir\
-xsmall\
-gff\
/home/kosukesano/tools/for_softmask/nama_data/250121_Sfem_upper1000.fasta
echo end at
dateqsubで実行した。
GINGERの実行
tools_GINGER_v1.0.1.sif、apptainerで落としてきたファイルを起動してapptainerの対話コンソールに入る。
次に、git cloneで持ってきたファイル群GINGER/にPATHを通す。
ginger実行
kosukesano@at138:~/tools/for_ginger$ ls
250123_test generateSampleData_cel.pl sample tools_GINGER_v1.0.1.sif util workspace
kosukesano@at138:~/tools/for_ginger$ ls 250123_test/
GINGER nama_data nextflow.config
kosukesano@at138:~/tools/for_ginger$ ls 250123_test/GINGER/
AUTHORS CHANGES ChangeLog FAQ INSTALL LICENSE Makefile README VERSION generateSampleData_cel.pl nextflow.config.user pipeline runEvaluatePred.pl runGINGER.pl src util
kosukesano@at138:~/tools/for_ginger$ apptainer shell tools_GINGER_v1.0.1.sif
Apptainer> export PATH=$PATH:/home/kosukesano/tools/for_ginger/250123_test/GINGER
Apptainer> runGINGER.pl nextflow.config.user
No configuration file for user specific settings.
(/home/kosukesano/tools/for_ginger/nextflow.config.user) at /home/kosukesano/tools/for_ginger/250123_test/GINGER/runGINGER.pl line 88.
Apptainer> exit
exit
kosukesano@at138:~/tools/for_ginger$ less 250123_test/GINGER/nextflow.config.user
kosukesano@at138:~/tools/for_ginger$これでnextflow.config.userをちゃんと整えればワンチャンいけるかもしれへん
0127
コフキゾウムシのソフトマスク続き
kosukesano@at139:~/tools/for_softmask/250121_Kohuki_softmask$ ls RM_106086.ThuJan231122322025/
consensi.fa consensi.fa.classified families-classified.stk families.stk round-1 round-2 round-3 round-4 round-5 round-6 tmpConsensi.fa
kosukesano@at139:~/tools/for_softmask/250121_Kohuki_softmask$ できてそう。
生データを移動
kosukesano@at139:~/tools/for_softmask/250121_Kohuki_softmask$ cp ../kohuki_softmask/250121_kohuki_upper1000.fasta ../nama_data/
kosukesano@at139:~/tools/for_softmask/250121_Kohuki_softmask$ RepeatMaskerを実行。
### Kohuki_RepeatMasker_250127.sh
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 24
#$ -l s_vmem=12G
#$ -l mem_req=12G
echo start at
date
apptainer exec /home/kosukesano/tools/for_RepeatMasker_Docker/dfam-tetools_1.sif\
RepeatMasker\
-pa 6\
-s\
-lib /home/kosukesano/tools/for_softmask/250121_Kohuki_softmask/Kohuki_merged.fasta\
-dir /home/kosukesano/tools/for_softmask/250121_Kohuki_softmask/output_dir\
-xsmall\
-gff\
/home/kosukesano/tools/for_softmask/nama_data/250121_kohuki_upper1000.fasta
echo end at
dateこれをqsubで投げた
フェモラータのRepeatMasker結果
kosukesano@at139:~/tools/for_softmask/250121_Sfem_softmask/output_dir$ ls
250121_Sfem_upper1000.fasta.cat.gz 250121_Sfem_upper1000.fasta.masked 250121_Sfem_upper1000.fasta.out 250121_Sfem_upper1000.fasta.out.gff 250121_Sfem_upper1000.fasta.tbl
kosukesano@at139:~/tools/for_softmask/250121_Sfem_softmask/output_dir$できてそう。これを一応BUSCOかけておく。
フェモラータのソフトマスク後ゲノムでのBUSCO
### Sfem_G_BUSCO_250127.sh
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 12
echo start at
date
date
singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco\
-m geno\
-i /home/kosukesano/tools/for_softmask/250121_Sfem_softmask/output_dir/250121_Sfem_upper1000.fasta.masked\
-o /home/kosukesano/tools/for_softmask/250121_Sfem_softmask/output_dir/BUSCO_output_Sfem_genome\
-l\
/home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/\
-f
echo end at
dateメモリ少ないと途中で解析止まっちゃうよ!
結果はこう
### ~/tools/for_softmask/250121_Sfem_softmask/output_dirBUSCO_output_Sfem_genome/run_arthropoda_odb10/short_summary.txt
# BUSCO version is: 5.1.3
# The lineage dataset is: arthropoda_odb10 (Creation date: 2024-01-08, number of genomes: 90, number of BUSCOs: 1013)
# Summarized benchmarking in BUSCO notation for file /home/kosukesano/tools/for_softmask/250121_Sfem_softmask/output_dir/250121_Sfem_upper1000.fasta.masked
# BUSCO was run in mode: genome
# Gene predictor used: metaeuk
***** Results: *****
C:98.0%[S:97.1%,D:0.9%],F:0.6%,M:1.4%,n:1013
993 Complete BUSCOs (C)
984 Complete and single-copy BUSCOs (S)
9 Complete and duplicated BUSCOs (D)
6 Fragmented BUSCOs (F)
14 Missing BUSCOs (M)
1013 Total BUSCO groups searched
Dependencies and versions:
hmmsearch: 3.1
metaeuk: 4.a0f584dkosukesano@at137:~/tools/for_softmask/nama_data$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat 250121_Sfem_upper1000.fasta
file format type num_seqs sum_len min_len avg_len max_len
250121_Sfem_upper1000.fasta FASTA DNA 4,530 495,258,564 1,001 109,328.6 7,760,786
kosukesano@at137:~/tools/for_softmask/nama_data$フェモラータのBRAKER3
kosukesano@at137:~/tools/for_braker/nama_data$ mv ~/tools/for_softmask/250121_Sfem_softmask/output_dir/250121_Sfem_upper1000.fasta.masked 250127_Sfem_upper1000_masked.fasta
kosukesano@at137:~/tools/for_braker/nama_data$ ls
231117_Madara_softmasked.fasta Dfro.fna Pst_NotUseEDTA_upper5000.fna kohuki_busco.sh.e26238968
241017_Ojiro_masked.fa Dval.fna Sfem_RNAseq kohuki_busco.sh.o26238968
241120_madara_dfam.fasta Ekam_NotUseEDTA.fna Sfem_pilon_softmasked.fasta kohuki_busco.sh.pe26238968
241127_madara_DockerRM.fasta Ekam_oomoji.fna Sfem_softmasked.fasta kohuki_busco.sh.po26238968
241127_madara_dfam_RM_data_NotUsedBuildDB.fasta Elaeidobius_kamerunicus.masked.fna busco_downloads kohuki_softmasked.fasta
241127_madara_dfam_RMdata_buildDB.fasta GCA_014849505.1_AAL_Ekam_1.0_genomic.fna.masked.gz femo_busco.sh kohuki_softmasked_upper1000.fasta
241129_madara_dfamplusbuilddb.fasta Madara_RNAseq femo_busco.sh.e26221930 length.txt
241226_ojiro_masked.fa Ojiro_RNAseq femo_busco.sh.o26221930 madaralength.txt
250127_Sfem_upper1000_masked.fasta Pst_NotUseEDTA.fna femo_busco.sh.pe26221930
BUSCO_OUTPUT_FEMO_GENOME Pst_NotUseEDTA_upper1000.fna femo_busco.sh.po26221930
BUSCO_OUTPUT_KOHUKI_GENOME Pst_NotUseEDTA_upper10000.fna kohuki_busco.sh
kosukesano@at137:~/tools/for_braker/nama_data$ ### Sfem_braker_250127.sh
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/250127_Sfem_upper1000_masked.fasta\
--prot_seq=/home/kosukesano/tools/Arthropoda.fa\
--rnaseq_sets_ids=Sfem-1_1,femo-larva_1,femo_H1_1,femo_H3_1,femo_L1_1,femo_L3_1,femo_O1_1,femo_O3_1,femo_T1_1,femo_T3_1,Sfem-1_2,femo-larva_2,femo_H1_2,femo_H3_2,femo_L1_2,femo_L3_2,femo_O1_2,femo_O3_2,femo_T1_2,femo_T3_2,femo-female_1,femo-male_1,femo_H2_1,femo_H4_1,femo_L2_1,femo_L4_1,femo_O2_1,femo_O4_1,femo_T2_1,femo_T4_1,femo-female_2,femo-male_2,femo_H2_2,femo_H4_2,femo_L2_2,femo_L4_2,femo_O2_2,femo_O4_2,femo_T2_2,femo_T4_2 \
--rnaseq_sets_dir=/home/kosukesano/tools/for_braker/nama_data/Sfem_RNAseq\
--threads=16\
--species=250127_Sfemorata\
--AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
--AUGUSTUS_BIN_PATH=/usr/bin\
--AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
--TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
echo end at
dateGINGERの実行
~/tools/for_ginger/250123_testで250127_nextflow.config.userを作成し、ここで実行しようとしたが以下のエラーが発生。
kosukesano@at137:~/tools/for_ginger/250123_test$ apptainer shell ../tools_GINGER_v1.0.1.sif
Apptainer> export PATH=$PATH:/home/kosukesano/tools/for_ginger/250123_test/GINGER
Apptainer> runGINGER.pl /home/kosukesano/tools/for_ginger/250123_test/250127_nextflow.config.user
cp: '/home/kosukesano/tools/for_ginger/250123_test/250127_nextflow.config.user' and '/home/kosukesano/tools/for_ginger/250123_test/250127_nextflow.config.user' are the same file
Apptainer> これは遺伝研特有のコピーディレクトリが悪さをしているのでは?
いやいやスクリプトの中身でcpコマンドがあって、こいつが悪さをしているっぽい
0128
Orthofinderのインプットに使ってた種、間違えていないか?
### ~/tools/for_orthofinder/250123_6sp_iso1/OrthoFinder/Results_Jan23_1/Orthogroups/Orthogroups.txtの中身
OG0000000: Agra_P_050299707.1 Cfor_P_060516509.1 Cfor_P_060517001.1 Cfor_P_060518228.1 Cfor_P_060518420.1 Cfor_P_060519135.1 Cfor_P_060519160.1 Cfor_P_060519161.1 Cfor_P_060520792.1 Cfor_P_060527544.1 Cfor_P_060528287.1 Cfor_P_060531558.1 Cfor_P_060531998.1 Cfor_P_060533842.1 Cfor_P_060534539.1 Cfor_P_060534541.1 Cfor_P_060534544.1 Cfor_P_060534546.1 Cfor_P_060535885.1 Cfor_P_060537097.1 Dpon_P_019755307.1 Dpon_P_019762611.2 Dpon_P_019772941.2 Dpon_P_048518566.1 Dpon_P_048519274.1 Dpon_P_048521352.1 Dpon_P_048521747.1 Dpon_P_048523240.1 Smad_g7893.t1 Sory_P_030746543.1 Sory_P_030746551.1 Sory_P_030746552.1 Sory_P_030746582.1 Sory_P_030746655.1 Sory_P_030747073.1 Sory_P_030747074.1 Sory_P_030747075.1 Sory_P_030747801.1 Sory_P_030747802.1 Sory_P_030748222.1 Sory_P_030749813.1 Sory_P_030749814.1 Sory_P_030750813.1 Sory_P_030751507.1 Sory_P_030751509.1 Sory_P_030751544.1 Sory_P_030751614.1 Sory_P_030751739.1 Sory_P_030751799.1 Sory_P_030752348.1 Sory_P_030752696.1 Sory_P_030753012.1 Sory_P_030753074.1 Sory_P_030754214.1 Sory_P_030756675.1 Sory_P_030756676.1 Sory_P_030757554.1 Sory_P_030758072.1 Sory_P_030758322.1 Sory_P_030758536.1 Sory_P_030760004.1 Sory_P_030760368.1 Sory_P_030760810.1 Sory_P_030760811.1 Sory_P_030761017.1 Sory_P_030761914.1 Sory_P_030762002.1 Sory_P_030762174.1 Sory_P_030762175.1 Sory_P_030762745.1 Sory_P_030762746.1 Sory_P_030763129.1 Sory_P_030764236.1 Sory_P_030764263.1 Sory_P_030764314.1 Sory_P_030764610.1 Sory_P_030764983.1 Sory_P_030765172.1 Sory_P_030765480.1 Sory_P_030765760.1 Sory_P_030767016.1 Sory_P_030767532.1 Sory_P_030767905.1Cfor_P_ってなんだ!?
元は1115のorthofinder、さらにそのインプット元は/home/kosukesano/tools/for_isoform_ex/output_dataだったはず。
### /home/kosukesano/tools/for_isoform_ex/output_data/Cass_iso1.faaの中身
>XP_060524352.1 natterin-3-like [Cylas formicarius]
MAAYYWVDTVARRRVPSTALRGGTDVDGQPIYVGRAFHEGDWIPAKVIPGKQVAYVAYGG
REIPKSQFQVLCEQQFDWVPSRHGHVPPDAVIGGKTSSGENLYIGRVRHRGSHTVGKVHP
SHKCCYIPFDGKEVPHQDYEILVLRG
>XP_060531338.1 uncharacterized protein LOC132704961 [Cylas formicarius]
MRDVAKGTSRQSQRGMSPNSDQSYFERLCPLPYGCACQTTPKGRRTGPCRPRNLDGFLRT
YGFIVTNGSHPVLRTIDAIPKGAGLRLSTLARATKRATRSGKTTRASAKALAGLQ
>XP_060531951.1 uncharacterized protein LOC132705400 isoform X1 [Cylas formicarius]
MYTQTIGWCLFGLLFSGTVLTTLAYPNSQPMPSYRPIRGAPPTLQQVNSVEQMHQERERK
FAEKPNAIKKVALDDLDNVQTNQISESAGGGFSWSNLLGTSYLTPLVNGMLMQMIFNPGG
GVPTGPNKSEGLDDGGVAPSPWANLITMGLKILSAILGGGAAAQNEGIDKVDNGGGSPLQ
GVLAAVVSTMVGGRDPQQVNMLAKQAGEFINIVVNLLDALKTSFSHRSLAARNLGRKDSV
SDAAIAGISMMKGYAKSLGTDESNCMARYMCQANNECSTDIGQSSLFCHLGSYAASFVLD
KATASTTFDLLYEAGRRGRSGDNCQQAYLECNEV違う種じゃんか〜!!!!
6種のOrthoFinderやり直し
まずディレクトリを作成してCassの生データを移動。
kosukesano@at138:~/tools/for_orthofinder$ mkdir 250128_6sp_iso1
kosukesano@at138:~/tools/for_orthofinder$ cp Smad_Agra_Cass_Dpon_Sory_Tcas_fasta_dir/Cass.fasta 250128_6sp_iso1/
kosukesano@at138:~/tools/for_orthofinder$ cd 250128_6sp_iso1/
kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ls
Cass.fasta
kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Ceutorhynchus_assimilis/ncbi_dataset/data/GCA_917834065.1$ cp genomic.gff ~/tools/for_orthofinder/250128_6sp_iso1/Cass.gff
kosukesano@at138:~/old_envilonment_until20240430/other_weevil/Ceutorhynchus_assimilis/ncbi_dataset/data/GCA_917834065.1$ cd ~/tools/for_orthofinder/250128_6sp_iso1/
kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ls
Cass.fasta Cass.gff
kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ mkdir nama_data
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ls
Cass.fasta Cass.gff nama_data
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ mv Cass.* nama_data/
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ls
nama_data
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ls nama_data/
Cass.fasta Cass.gff
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$faspを使用してアイソフォームを除去
kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ source ~/tools/for_isoform_ex/fasp/bin/activate
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ python3 -m fasp exclude_isoforms_by_length nama_data/Cass.fasta Cass_iso1.fasta nama_data/Cass.gff
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ls
Cass_iso1.fasta nama_data
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ヘッダーの書き換え
このedit.pyを実行した。
### ~/tools/for_orthofinder/250128_6sp_iso1/edit.py
import os
from Bio import SeqIO
# 入力ディレクトリと出力ディレクトリのパス
input_dir = '/home/kosukesano/tools/for_orthofinder/250128_6sp_iso1/'
output_dir = '../250128_6sp_iso1/Change_hedder/'
# 出力ディレクトリが存在しない場合は作成
if not os.path.exists(output_dir):
os.makedirs(output_dir)
# 入力ディレクトリ内のすべての .faa または .aa ファイルを処理
for input_file in os.listdir(input_dir):
if input_file.endswith(('.faa', '.aa', '.fasta')):
input_path = os.path.join(input_dir, input_file)
output_path = os.path.join(output_dir, input_file)
# ファイル形式を設定
format_type = 'fasta' # Biopython では .faa も .aa も "fasta" 形式として扱う
with open(output_path, 'w') as outfile:
for record in SeqIO.parse(input_path, format_type):
header = record.description
seq = str(record.seq)
new_header = "" # 初期化
# ヘッダーが「g」で始まる場合
if header.startswith("g"):
number = header.split()[0] # ヘッダーの最初の番号部分を取得
new_header = f">Smad_{number}"
# ヘッダーが「]」で終わる場合
elif header.endswith("]"):
within_brackets = header.split('[')[-1].split(']')[0]
first_letter = within_brackets[0] # 最初の1文字
space_after = within_brackets.split()[-1][:3] # スペース後の3文字
first_part = header.split()[0][1:]
new_header = f">{first_letter}{space_after}_{first_part}"
# それ以外
else:
new_header = f">{header.split()[0]}"
# 新しいヘッダーと配列を出力ファイルに書き込む
outfile.write(f"{new_header}\n{seq}\n")
print(f"{output_path} に保存しました。")インプットを.fastaにも対応させてる。
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ python edit.py
../250128_6sp_iso1/Change_hedder/Cass_iso1.fasta に保存しました。
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ls
Cass_iso1.fasta Change_hedder edit.py nama_data
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ls Change_hedder/
Cass_iso1.fasta
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$他のデータをコピーしてくる
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ cp ../250123_6sp_iso1/Agra_iso1.faa Change_hedder/
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ cp ../250123_6sp_iso1/Dpon_iso1.faa Change_hedder/
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ cp ../250123_6sp_iso1/Sory_iso1.faa Change_hedder/
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ cp ../250123_6sp_iso1/Smad_iso1.faa Change_hedder/
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ cp ../250123_6sp_iso1/Tcas_iso1.faa Change_hedder/
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ cd Change_hedder/
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder$ mv Cass_iso1.fasta Cass_iso1.faa
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder$ ls
Agra_iso1.faa Cass_iso1.faa Dpon_iso1.faa Smad_iso1.faa Sory_iso1.faa Tcas_iso1.faa
(fasp) kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder$ 続いて~/tools/for_orthofinder/250128_6sp_iso1/でorthofinder_250128.shを作成、実行した。
### orthofinder_250128.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -pe def_slot 16
#$ -l gpu
echo start at
date
singularity exec /usr/local/biotools/o/orthofinder:2.5.4--hdfd78af_0 orthofinder\
-f /home/kosukesano/tools/for_orthofinder/250128_6sp_iso1/Change_hedder\
-t 16
echo end at
dateできてそう
kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ls Change_hedder/OrthoFinder/Results_Jan28/
Citation.txt Gene_Trees Orthogroups Phylogenetically_Misplaced_Genes Single_Copy_Orthologue_Sequences
Comparative_Genomics_Statistics Log.txt Orthologues Putative_Xenologs Species_Tree
Gene_Duplication_Events Orthogroup_Sequences Phylogenetic_Hierarchical_Orthogroups Resolved_Gene_Trees WorkingDirectory
kosukesano@at138:~/tools/for_orthofinder/250128_6sp_iso1$ ローカル環境でのGINGERインストール
遺伝研での実行を諦め、ローカルでDockerを使うことにした。
まずDocker Desktopをインストールしておく。 その後、以下のコマンドを実行。
(base) :~/bio/for_ginger$ docker pull i10labtitech/tools:GINGER_v1.0.1
GINGER_v1.0.1: Pulling from i10labtitech/tools
482bd95e477d: Download complete
1bc677758ad7: Download complete
Digest: sha256:8f6de2fc83d99a8df64fcc82cddad1bdca6e0d4175757e629a8ff7da6f106421
Status: Downloaded newer image for i10labtitech/tools:GINGER_v1.0.1
docker.io/i10labtitech/tools:GINGER_v1.0.1
(base) :~/bio/for_ginger$ (base) :~/bio/for_ginger$ docker images
REPOSITORY TAG IMAGE ID CREATED SIZE
hello-world latest d715f14f9eca 6 days ago 17kB
dfam/tetools latest f60775010b4d 4 months ago 4.18GB
i10labtitech/tools GINGER_v1.0.1 8f6de2fc83d9 20 months ago 23.6GB
(base) :~/bio/for_ginger$ちゃんと入ってそう。
docker run -t -i i10labtitech/tools:GINGER_v1.0.1 /bin/bashで実行
(base) :~/bio/for_ginger$ docker run -t -i i10labtitech/tools:GINGER_v1.0.1 /bin/bash
WARNING: The requested image's platform (linux/amd64) does not match the detected host platform (linux/arm64/v8) and no specific platform was requested
(base) root@f9f7fcf458b4:/# pwd
/
(base) root@f9f7fcf458b4:/# ls
GINGER_v1.0.1 bin boot data1 data2 dev etc home lib lib32 lib64 libx32 media mnt nextflow opt proc root run sbin scratch srv sys tmp usr var
(base) root@f9f7fcf458b4:/#0129
フェモラータのBRAKER
kosukesano@at138:~/tools/for_braker/250127_Sfem$ ls braker/
Augustus GeneMark-ETP braker.aa braker.codingseq braker.gtf braker.log errors genome_header.map hintsfile.gff species what-to-cite.txt
kosukesano@at138:~/tools/for_braker/250127_Sfem$
kosukesano@at138:~/tools/for_braker/250127_Sfem/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file format type num_seqs sum_len min_len avg_len max_len
braker.aa FASTA Protein 14,515 6,313,363 2 435 6,295
kosukesano@at138:~/tools/for_braker/250127_Sfem/braker$ できてそうではある
BRAKERでアノテーションつけたフェモラータのBUSCO
これを`qsubで投げた
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 12
echo start at
date
date
singularity exec -e /usr/local/biotools/b/busco:5.1.3--pyhdfd78af_0 busco\
-m protein\
-i /home/kosukesano/tools/for_braker/250127_Sfem/braker/braker.aa\
-o BUSCO_output\
-l /home/kosukesano/old_envilonment_until20240430/busco_downloads/busco_downloads/lineages/arthropoda_odb10/\
-f
echo end at
date-oのオプションはフルパス通すとエラーになるので注意!
結果がこう
# BUSCO version is: 5.1.3
# The lineage dataset is: (Creation date: 2024-01-08, number of genomes: 90, number of BUSCOs: 1013)
# Summarized benchmarking in BUSCO notation for file /home/kosukesano/tools/for_braker/250127_Sfem/braker/braker.aa
# BUSCO was run in mode: proteins
***** Results: *****
C:73.4%[S:56.8%,D:16.6%],F:5.0%,M:21.6%,n:1013
743 Complete BUSCOs (C)
575 Complete and single-copy BUSCOs (S)
168 Complete and duplicated BUSCOs (D)
51 Fragmented BUSCOs (F)
219 Missing BUSCOs (M)
1013 Total BUSCO groups searched
Dependencies and versions:
hmmsearch: 3.1うーん……。ゲノムの時はいい感じだったから、BRAKERの設定が悪いのかなあ。
ローカルのファイルをDockerコンテナで使う練習
テスト用にpythonの環境を立てる
(base) :~/bio/for_ginger$ docker pull python
Using default tag: latest
latest: Pulling from library/python
4cf0e15c283e: Download complete
e474a4a4cbbf: Download complete
94c5996c7a64: Download complete
133055fd9ad7: Download complete
936252136b92: Download complete
00fcba8cde0d: Download complete
d22b85d68f8a: Download complete
Digest: sha256:137ae4b9f85671bd912a82a19b6966e2655f73e13579b5d6ad4edbddaaf62a9c
Status: Downloaded newer image for python:latest
docker.io/library/python:latest
(base) :~/bio/for_ginger$ docker images
REPOSITORY TAG IMAGE ID CREATED SIZE
hello-world latest d715f14f9eca 7 days ago 17kB
python latest 137ae4b9f856 11 days ago 1.47GB
dfam/tetools latest f60775010b4d 4 months ago 4.18GB
i10labtitech/tools GINGER_v1.0.1 8f6de2fc83d9 20 months ago 23.6GB
(base) :~/bio/for_ginger$ docker container run -it python
Python 3.13.1 (main, Jan 24 2025, 20:47:48) [GCC 12.2.0] on linux
Type "help", "copyright", "credits" or "license" for more information.
>>> exit(base) :~/bio/for_ginger$ docker ps -a
CONTAINER ID IMAGE COMMAND CREATED STATUS PORTS NAMES
d3c9a7023050 python "python3" About a minute ago Exited (0) 13 seconds ago mystifying_goldberg
c680624159e6 dfam/tetools:latest "/bin/bash" 15 minutes ago Exited (0) 2 minutes ago infallible_lehmann
f9f7fcf458b4 i10labtitech/tools:GINGER_v1.0.1 "/bin/bash" 21 hours ago Exited (0) 18 minutes ago strange_chaum
7d2bf9d26f8e i10labtitech/tools:GINGER_v1.0.1 "/bin/bash" 21 hours ago Exited (0) 21 hours ago distracted_shockley
8481b2955c5a i10labtitech/tools:GINGER_v1.0.1 "/bin/bash" 21 hours ago Exited (0) 21 hours ago sweet_lederberg
04727df21c92 hello-world "/hello" 21 hours ago Exited (0) 21 hours ago stupefied_lovelace
73f896bc2cd9 dfam/tetools "bash" 2 months ago Exited (255) 21 hours ago dfamtet
(base) :~/bio/for_ginger$ (base) :~/bio/for_ginger$ docker container restart d3c9a7023050
d3c9a7023050
(base) :~/bio/for_ginger$ docker exec -it charming_mirzakhani bash
Error response from daemon: No such container: charming_mirzakhani
(base) :~/bio/for_ginger$ docker exec -it python bash
Error response from daemon: No such container: python
(base) :~/bio/for_ginger$ docker exec -it d3c9a7023050 bash
root@d3c9a7023050:/# pwd
/
root@d3c9a7023050:/# ls
bin boot dev etc home lib media memo.txt mnt opt proc root run sbin srv sys tmp usr var
root@d3c9a7023050:/# less memo.txt
bash: less: command not found
root@d3c9a7023050:/#root@d3c9a7023050:/# cat /etc/issue
Debian GNU/Linux 12 \n \l
root@d3c9a7023050:/# apt-get update
Get:1 http://deb.debian.org/debian bookworm InRelease [151 kB]
Get:2 http://deb.debian.org/debian bookworm-updates InRelease [55.4 kB]
Get:3 http://deb.debian.org/debian-security bookworm-security InRelease [48.0 kB]
Get:4 http://deb.debian.org/debian bookworm/main arm64 Packages [8693 kB]
Get:5 http://deb.debian.org/debian bookworm-updates/main arm64 Packages [13.3 kB]
Get:6 http://deb.debian.org/debian-security bookworm-security/main arm64 Packages [239 kB]
Fetched 9199 kB in 1s (9432 kB/s)
Reading package lists... Done
root@d3c9a7023050:/# apt-get install less -y
Reading package lists... Done
Building dependency tree... Done
Reading state information... Done
The following NEW packages will be installed:
less
0 upgraded, 1 newly installed, 0 to remove and 5 not upgraded.
Need to get 128 kB of archives.
After this operation, 434 kB of additional disk space will be used.
Get:1 http://deb.debian.org/debian bookworm/main arm64 less arm64 590-2.1~deb12u2 [128 kB]
Fetched 128 kB in 0s (2188 kB/s)
debconf: delaying package configuration, since apt-utils is not installed
Selecting previously unselected package less.
(Reading database ... 23992 files and directories currently installed.)
Preparing to unpack .../less_590-2.1~deb12u2_arm64.deb ...
Unpacking less (590-2.1~deb12u2) ...
Setting up less (590-2.1~deb12u2) ...
root@d3c9a7023050:/# less memo.txt
"memo.txt" may be a binary file. See it anyway?
root@d3c9a7023050:/# なんかバイナリ扱いされてんだが?
GINGERサンプルデータの取得
Dockerが起動している状態で(?)、ローカルの作業ノードで
perl generateSampleData_cel.py sample(base) :~/bio/for_ginger/test/GINGER$ ls
AUTHORS FAQ Makefile generateSampleData_cel.pl runEvaluatePred.pl src
CHANGES INSTALL README nextflow.config.user runGINGER.pl util
ChangeLog LICENSE VERSION pipeline sample
(base) :~/bio/for_ginger/test/GINGER$ ls sample/
GCA_000180635.4_El_Paco_v._4_translated_cds.faa GCF_000002985.6_WBcel235_genomic.out
GCF_000002985.6_WBcel235_genomic.commentModified.fna GCF_000004555.2_CB4_translated_cds.faa
GCF_000002985.6_WBcel235_genomic.commentModified.masked.fna SRR5849934_1.fastq
GCF_000002985.6_WBcel235_genomic.gff SRR5849934_2.fastq
(base) :~/bio/for_ginger/test/GINGER$ サンプルデータを用いたGINGER
### 250129_test_output/summary.stderrの中身
WARNING: The requested image's platform (linux/amd64) does not match the detected host platform (linux/arm64/v8) and no specific platform was requested
docker: Error response from daemon: Mounts denied:
The path /scratch is not shared from the host and is not known to Docker.
You can configure shared paths from Docker -> Preferences... -> Resources -> File Sharing.
See https://docs.docker.com/desktop/settings/mac/#file-sharing for more info.0130
コフキのソフトマスク結果
kosukesano@at137:~/tools/for_softmask/250121_Kohuki_softmask$ ls output_dir/
kosukesano@at137:~/tools/for_softmask/250121_Kohuki_softmask$結果が出力されていないんだが?
..................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................
(END)途中で止まっちゃったっぽい。なぜえ。
6種のゲノムデータを用いた種系統樹推定
- 1:
Manualphylo_dataディレクトリの作成
kosukesano@at137:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder/OrthoFinder/Results_Jan28$ mkdir Manualphylo_data
kosukesano@at137:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder/OrthoFinder/Results_Jan28$- 2:
Manualphylo_1.pyの実行
kosukesano@at137:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder/OrthoFinder/Results_Jan28/Manualphylo_data$ python Manualphylo_1.py
Orthogroup Agra_iso1 Cass_iso1 Dpon_iso1 Smad_iso1 Sory_iso1 Tcas_iso1
3316 OG0003316 Agra_P_050297705.1 Cass_AG9763147.1 Dpon_P_019756877.1 Smad_g5919.t1 Sory_P_030749172.1 Tcas_P_008190965.1
3318 OG0003318 Agra_P_050313709.1 Cass_AG9759263.1 Dpon_P_019760448.1 Smad_g2942.t1 Sory_P_030746210.1 Tcas_P_008193499.1
3319 OG0003319 Agra_P_050310562.1 Cass_AG9760850.1 Dpon_P_019762920.2 Smad_g874.t1 Sory_P_030761663.1 Tcas_P_008197831.1
3320 OG0003320 Agra_P_050302120.1 Cass_AG9772342.1 Dpon_P_019761695.1 Smad_g4673.t1 Sory_P_030759635.1 Tcas_P_966819.1
3321 OG0003321 Agra_P_050298809.1 Cass_AG9761965.1 Dpon_P_019754246.2 Smad_g9255.t2 Sory_P_030750748.1 Tcas_P_015834610.1
... ... ... ... ... ... ... ...
8556 OG0008556 Agra_P_050313166.1 Cass_AG9759544.1 Dpon_P_019753580.1 Smad_g11268.t1 Sory_P_030756213.1 Tcas_P_015834054.1
8557 OG0008557 Agra_P_050299456.1 Cass_AG9767641.1 Dpon_P_019768109.2 Smad_g10715.t1 Sory_P_030747660.1 Tcas_P_008194715.1
8558 OG0008558 Agra_P_050296789.1 Cass_AH1126441.1 Dpon_P_019767524.1 Smad_g3535.t1 Sory_P_030750133.1 Tcas_P_008195711.1
8559 OG0008559 Agra_P_050308325.1 Cass_AG9761889.1 Dpon_P_048517173.1 Smad_g9800.t1 Sory_P_030761000.1 Tcas_P_968816.1
8560 OG0008560 Agra_P_050308491.1 Cass_AH1131527.1 Dpon_P_048526405.1 Smad_g6531.t1 Sory_P_030748824.1 Tcas_P_001811794.1
[4830 rows x 7 columns]
kosukesano@at137:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder/OrthoFinder/Results_Jan28/Manualphylo_data$ ls
Manualphylo_1.py OG_list.txt species_list.txt
kosukesano@at137:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder/OrthoFinder/Results_Jan28/Manualphylo_data$- 3:
all_seq.faの作成
fasta_concatinate.shを実行する
kosukesano@at137:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder/OrthoFinder/Results_Jan28/Manualphylo_data$ sh fasta_concatinate.sh
start at
Thu Jan 30 15:37:27 JST 2025
Thu Jan 30 15:37:29 JST 2025
kosukesano@at137:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder/OrthoFinder/Results_Jan28/Manualphylo_data$ ls
Manualphylo_1.py OG_list.txt all_seq.fa fasta_concatinate.sh species_list.txt
kosukesano@at137:~/tools/for_orthofinder/250128_6sp_iso1/Change_hedder/OrthoFinder/Results_Jan28/Manualphylo_data$4:
Manualphylo_2.pyの実行
SCOのOG番号ごとにファイルができる、時間かかる5:
align.shの実行
SCOのファイルがMAFFTによりアライメントされる,時間かかる6:
makealltreeの実行
2025年2月
0203
makealltree.sh出力のファイルを元にしたASTRALの実行
~/tools/for_ASTRAL/Astral/data/250203_6spにall_trees.nwkをコピー。
### all_trees.nwkの中身
OG0003316: (Agra_P_050297705.1:0.1651293024,(Cass_AG9763147.1:0.1818482204,(Sory_P_030749172.1:0.1209868450,Tcas_P_008190965.1:0.9786646109)49:0.1585813214)37:0.0878202062,(Dpon_P_019756877.1:0.1886146557,Smad_g5919.t1:0.3120157875)25:0.0602110128);
OG0003318: (Agra_P_050313709.1:0.0315659308,(Cass_AG9759263.1:0.0142661688,Sory_P_030746210.1:0.0106140127)72:0.0047846194,((Dpon_P_019760448.1:0.0204733482,Tcas_P_008193499.1:0.0623490402)46:0.0013761600,Smad_g2942.t1:0.0000010000)41:0.0000010000);
OG0003319: (Agra_P_050310562.1:0.1472894568,((Cass_AG9760850.1:0.1222426475,Smad_g874.t1:0.1761863246)64:0.0413255253,Sory_P_030761663.1:0.1262376730)58:0.0310103463,(Dpon_P_019762920.2:0.1504657030,Tcas_P_008197831.1:0.3955563338)53:0.0738820816);
OG0003320: (Agra_P_050302120.1:0.2407713260,(Cass_AG9772342.1:0.0426387253,Dpon_P_019761695.1:0.2475477632)61:0.0326335634,(Smad_g4673.t1:0.1131079176,(Sory_P_030759635.1:0.2653371088,Tcas_P_966819.1:0.5311370910)99:0.1490033020)61:0.0232625701);
OG0003321: (Agra_P_050298809.1:0.4846044357,(Cass_AG9761965.1:0.6729211459,(Dpon_P_019754246.2:0.3812087652,Smad_g9255.t2:0.1754803069)72:0.0580715184)51:0.0626765693,(Sory_P_030750748.1:0.4116607756,Tcas_P_015834610.1:1.1104186828)35:0.0790941709);
.
.
.
.
.
.OG番号や遺伝子IDなどの無駄な情報があるので、それらを削除するスクリプトを用意した。
### modify.pyの中身
import re
# 元のファイルと新しいファイルのパスを設定
input_file_path = 'all_trees.nwk'
output_file_path = 'modified_trees.nwk'
# 処理を実行
with open(input_file_path, 'r') as infile, open(output_file_path, 'w') as outfile:
for line in infile:
# 行を ': ' で分割し、2つ以上の要素がある場合のみ処理
parts = line.split(': ', 1)
if len(parts) > 1:
modified_line = parts[1] # `:` 以降の部分のみ取得
else:
modified_line = line # `:` がない場合はそのまま保持
# ラベルの変換:「四文字の種名_遺伝子名」→「四文字の種名」のみ
modified_line = re.sub(r"\b([A-Za-z]{4})_[^,():]+", r"\1", modified_line)
# 新しいファイルに書き込み
outfile.write(modified_line)
print("ツリーファイルの変換が完了しました:", output_file_path)これまでのmodify.pyと違って、遺伝子IDも除去するようにスクリプトを改造してある。
この出力のmodified_trees.nwkを使用する。
### ASTRAL.shの中身
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
echo start at
date
java -Xmx2G -jar astral.5.7.8.jar \
-i /home/kosukesano/tools/for_ASTRAL/Astral/data/250203_6sp/modified_trees.nwk \
-o /home/kosukesano/tools/for_ASTRAL/Astral/250203_6sp/out.tre \
2>/home/kosukesano/tools/for_ASTRAL/Astral/250203_6sp/out.log
dateこの出力がこれ
kosukesano@at139:~/tools/for_ASTRAL/Astral$ ls 250203_6sp/
out.log out.tre
kosukesano@at139:~/tools/for_ASTRAL/Astral$(Agra,((Sory,Tcas)1:0.8510176414897351,(Dpon,(Smad,Cass)0.89:0.0330270621831446)1:0.05640338444435427):0.0);0204
フェモラータゲノムのBRAKER、RNA-Seqデータ無し
フェモラータのBRAKERのクオリティが低いのはRNA-seqの個体とゲノムの元個体が違う産地で、ちゃんとくっつかなかったからでは?RNA-seqデータを抜いてやってみる。
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 16
echo start at
date
source /home/kosukesano/tools/pyenv_env/braker_profile
braker.pl --genome=/home/kosukesano/tools/for_braker/nama_data/250127_Sfem_upper1000_masked.fasta\
--prot_seq=/home/kosukesano/tools/Arthropoda.fa\
--threads=16\
--species=250204_Sfemorata\
--AUGUSTUS_CONFIG_PATH=/usr/share/augustus/config\
--AUGUSTUS_BIN_PATH=/usr/bin\
--AUGUSTUS_SCRIPTS_PATH=/usr/share/augustus/scripts\
--GENEMARK_PATH=/home/kosukesano/tools/GeneMarkETP_git_install/GeneMark-ETP/bin\
--PROTHINT_PATH=/home/kosukesano/tools/ProtHint_git_install/ProtHint/bin\
--TSEBRA_PATH=/home/kosukesano/tools/TSEBRA_git_install/TSEBRA/bin
echo end at
dateRNA-Seqのインプットを抜いただけ
0205
RNA-seqデータを抜いたフェモラータのBRAKER結果・BUSCO
kosukesano@at138:~/tools/for_braker/250204_Sfem/braker$ ls
Augustus GeneMark-EP GeneMark-ES braker.aa braker.codingseq braker.gtf braker.log errors genome_header.map hintsfile.gff prothint.gff species what-to-cite.txt
kosukesano@at138:~/tools/for_braker/250204_Sfem/braker$
kosukesano@at138:~/tools/for_braker/250204_Sfem/braker$ singularity exec -e /usr/local/biotools/s/seqkit\:2.5.0--h9ee0642_0 seqkit stat braker.aa
file format type num_seqs sum_len min_len avg_len max_len
braker.aa FASTA Protein 17,544 7,428,397 6 423.4 21,411
kosukesano@at138:~/tools/for_braker/250204_Sfem/braker$シーケンス数が3000くらい増えてるね。
BUSCOの結果はこう
# BUSCO version is: 5.1.3
# The lineage dataset is: (Creation date: 2024-01-08, number of genomes: 90, number of BUSCOs: 1013)
# Summarized benchmarking in BUSCO notation for file /home/kosukesano/tools/for_braker/250204_Sfem/braker/braker.aa
# BUSCO was run in mode: proteins
***** Results: *****
C:92.2%[S:85.3%,D:6.9%],F:3.3%,M:4.5%,n:1013
934 Complete BUSCOs (C)
864 Complete and single-copy BUSCOs (S)
70 Complete and duplicated BUSCOs (D)
33 Fragmented BUSCOs (F)
46 Missing BUSCOs (M)
1013 Total BUSCO groups searched
Dependencies and versions:
hmmsearch: 3.1やっぱりRNAーseqのデータが良くなかったっぽいですね
OrthoFinder出力からSCOのCDSを取得
- 1.
ExOG.pyを実行 - 2.6種のCDSファイルをコピー
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ cp ~/tools/for_braker/241129_madara/braker/braker.codingseq Smad.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls
Smad.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls ~/old_envilonment_until20240430/other_weevil/Anthonomus_grandis_grandis/ncbi_dataset/data/GCF_022605725.1
Anthonomus_buscotest.sh Anthonomus_buscotest.sh.o25642658 Anthonomus_buscotest.sh.po25642658 busco_downloads cds_from_genomic.fna protein.faa sequence_report.jsonl
Anthonomus_buscotest.sh.e25642658 Anthonomus_buscotest.sh.pe25642658 GCF_022605725.1_icAntGran1.3_genomic.fna busco_out genomic.gff rna.fna
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ cp ~/old_envilonment_until20240430/other_weevil/Anthonomus_grandis_grandis/ncbi_dataset/data/GCF_022605725.1/cds_from_genomic.fna Agra.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls
Agra.fasta Smad.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls ~/old_envilonment_until20240430/other_weevil/Ceutorhynchus_assimilis/ncbi_dataset/data/GCA_917834065.1
Ceutorhynchus_buscotest.sh Ceutorhynchus_buscotest.sh.o25642655 Ceutorhynchus_buscotest.sh.po25642655 busco_downloads cds_from_genomic.fna protein.faa
Ceutorhynchus_buscotest.sh.e25642655 Ceutorhynchus_buscotest.sh.pe25642655 GCA_917834065.1_PGI_CEUTPL_v4_genomic.fna busco_out genomic.gff sequence_report.jsonl
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ cp ~/old_envilonment_until20240430/other_weevil/Ceutorhynchus_assimilis/ncbi_dataset/data/GCA_917834065.1/cds_from_genomic.fna Cass.
fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls
Agra.fasta Cass.fasta Smad.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls
Agra.fasta Cass.fasta Smad.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls ~/old_envilonment_until20240430/other_weevil/Soryzae/ncbi_dataset/data/GCF_002938485.1
GCF_002938485.1_Soryzae_2.0_genomic.fna Soryzae_busco.sh.e26203344 Soryzae_busco.sh.pe26203344 busco_downloads cds_from_genomic.fna genomic.gff protein.faa sequence_report.jsonl
Soryzae_busco.sh Soryzae_busco.sh.o26203344 Soryzae_busco.sh.po26203344 busco_out genomic.gbff genomic.gtf rna.fna
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ cp ~/old_envilonment_until20240430/other_weevil/Soryzae/ncbi_dataset/data/GCF_002938485.1/cds_from_genomic.fna
cp: missing destination file operand after '/home/kosukesano/old_envilonment_until20240430/other_weevil/Soryzae/ncbi_dataset/data/GCF_002938485.1/cds_from_genomic.fna'
Try 'cp --help' for more information.
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ cp ~/old_envilonment_until20240430/other_weevil/Soryzae/ncbi_dataset/data/GCF_002938485.1/cds_from_genomic.fna Sory.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls
Agra.fasta Cass.fasta Smad.fasta Sory.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls ~/old_envilonment_until20240430/ronbun_sp/Dendroctonus_ponderosae/ncbi_dataset/data/GCF_020466585.1
GCF_020466585.1_Dpon_F_20191213v2_genomic.fna cds_from_genomic.fna genomic.gff protein.faa rna.fna sequence_report.jsonl
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ cp ~/old_envilonment_until20240430/ronbun_sp/Dendroctonus_ponderosae/ncbi_dataset/data/GCF_020466585.1/cds_from_genomic.fna Dpon.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls
Agra.fasta Cass.fasta Dpon.fasta Smad.fasta Sory.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls ~/old_envilonment_until20240430/outgroup/Tribolium_castaneum/ncbi_dataset/data/GCF_000002335.3
GCF_000002335.3_Tcas5.2_genomic.fna Tribolium_buscotest.sh.o25642647 busco_downloads genomic.gff rna.fna test.faa
Tribolium_buscotest.sh Tribolium_buscotest.sh.pe25642647 busco_out output.faa sequence_report.jsonl test.gff
Tribolium_buscotest.sh.e25642647 Tribolium_buscotest.sh.po25642647 cds_from_genomic.fna protein.faa test
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ cp ~/old_envilonment_until20240430/outgroup/Tribolium_castaneum/ncbi_dataset/data/GCF_000002335.3/cds_from_genomic.fna Tcas.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$ ls
Agra.fasta Cass.fasta Dpon.fasta Smad.fasta Sory.fasta Tcas.fasta
kosukesano@at139:~/tools/for_paml/data/250205_6sp/nama_data$- 3.ch.hed.pyを実行
- 4.edit.pyを実行
- 5.new_makefna.pyを実行
- 6.mafft.shをqsubで実行
- 7.fix.pyを実行
0206
PAMLの実行
昨日までの前処理がうまくいき、/home/kosukesano/tools/for_paml/data/250205_6sp/SCO_CDSディレクトリに*_maffted_fixed.fnaファイルができた。これを使ってPAMLのBranch-Siteモデルにかける。
まず~/tools/for_paml/250206_6spディレクトリを作成し、その下にbsAとbs_nullディレクトリを新たに作成。
kosukesano@at139:~/tools/for_paml/250206_6sp$ ls
bsA bs_null
kosukesano@at139:~/tools/for_paml/250206_6sp$ bsAについて
~/tools/for_paml/250206_6sp/bsAディレクトリ直下でbsA_paml.shとtemplate.ctlを作成、bsA_paml.shをqsubで投げた。
### bsA_paml.sh
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/data/250205_6sp/SCO_CDS"
bsA_dir="/home/kosukesano/tools/for_paml/250206_6sp/bsA"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/template.ctl"
# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"
# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")
# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fna; do
if [[ -f "$file" ]]; then
base_name=$(basename "$file" .fna)
outfile_path="$result_dir/${base_name}_branch_alt"
# 一時的な制御ファイルの内容を生成
ctl_content="${ctl_template//<SEQFILE>/$file}"
ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"
# 一時的な制御ファイルを作成
ctl_path="$bsA_dir/bsA.ctl"
echo "$ctl_content" > "$ctl_path"
# PAMLを実行
singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"
echo "Processed file: $file, output: $outfile_path"
fi
done### template.ctl
seqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/data/250205_6sp/out.tre
outfile = <OUTFILE>
noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
model = 2
NSsites = 2
fix_omega = 0
omega = 1
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 0インプットのツリーファイルは~/tools/for_ASTRAL/Astral/250203_6sp/のASTRAL出力のものをコピーした。
bs_nullについて
同じくファイルを作成、qsubで投げた。
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
# ディレクトリの設定
input_dir="/home/kosukesano/tools/for_paml/data/250205_6sp/SCO_CDS"
bsA_dir="/home/kosukesano/tools/for_paml/250206_6sp/bs_null"
result_dir="$bsA_dir/result"
template_ctl="$bsA_dir/bsN_template.ctl"
# 出力ディレクトリが存在しない場合は作成
mkdir -p "$result_dir"
# テンプレートの制御ファイルを読み込む
ctl_template=$(cat "$template_ctl")
# ディレクトリ内の_maffted_fixed.fastaファイルを処理
for file in "$input_dir"/*_maffted_fixed.fna; do
if [[ -f "$file" ]]; then
base_name=$(basename "$file" .fna)
outfile_path="$result_dir/${base_name}_branch_alt"
# 一時的な制御ファイルの内容を生成
ctl_content="${ctl_template//<SEQFILE>/$file}"
ctl_content="${ctl_content//<OUTFILE>/$outfile_path}"
# 一時的な制御ファイルを作成
ctl_path="$bsA_dir/bsA.ctl"
echo "$ctl_content" > "$ctl_path"
# PAMLを実行
singularity exec -e /usr/local/biotools/p/paml:4.9--h779adbc_6 codeml "$ctl_path"
echo "Processed file: $file, output: $outfile_path"
fi
doneseqfile = <SEQFILE>
treefile = /home/kosukesano/tools/for_paml/data/250205_6sp/out.tre
outfile = <OUTFILE>
noisy = 9
verbose = 1
runmode = 0
seqtype = 1
CodonFreq = 2
clock = 0
model = 2
NSsites = 2
fix_omega = 1
omega = 1
icode = 0
fix_kappa = 0
kappa = 2
fix_alpha = 1
alpha = .0
Malpha = 0
ncatG = 4
getSE = 0
RateAncestor = 0
method = 0
fix_blength = 00207
trimalによるCDSのトリミング
なんかPAMLうまく解析出来てないんだよなあと思ったらトリミングできてなかった。MAFFTをかけたファイルについて、ギャップの部分をトリミングしておく。
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 6
source ~/tools/pyenv_env/ManualPhilo_profile
# ディレクトリパス
input_dir="/home/kosukesano/tools/for_paml/data/250205_6sp/SCO_CDS/"
output_dir="/home/kosukesano/tools/for_paml/data/250205_6sp/SCO_CDS/"
# 各ファイルに対してアラインメントを実行
for file in "$input_dir"*_maffted_fixed.fna; do
# 元のファイル名から拡張子を除いたものを取得
base_name=$(basename "$file" _maffted_fixed.fna)
# 出力ファイル名を生成
output_file="${output_dir}${base_name}_trimed.fna"
output_html="${output_dir}${base_name}_trimed.html"
# MAFFTを実行
mafft --auto --maxiterate 1000 --localpair "$file" > "$output_file"
trimal -in "$file" -out "$output_file" -htmlout "$output_html" -gt 0.9 -cons 60
echo "trimed file created: $output_file"
doneこれをqsubで投げた。-gt 0.9は-gt 1とかにした方がいいかも。
PAML続き
ツリーをout.log.treeに変更
(Cass,(Smad#1,(Dpon,(Agra,(Sory,Tcas)))));0210
PAML続き
### bs_lrp.py
import os
import re
from scipy.stats import chi2
def parse_lnL(file_path):
with open(file_path, 'r') as f:
for line in f:
match = re.search(r'lnL\(ntime: \d+ np: (\d+)\):\s+(-?\d+\.\d+)', line)
if match:
np = int(match.group(1))
lnL = float(match.group(2))
return np, lnL
return None, None
def perform_lrt(alt_lnL, alt_np, null_lnL, null_np):
lr_stat = 2 * (alt_lnL - null_lnL)
df = alt_np - null_np
p_val = chi2.sf(lr_stat, df)
return p_val
def main():
alt_dir = '/home/kosukesano/tools/for_paml/250207_6sp/bsA/result'
null_dir = '/home/kosukesano/tools/for_paml/250207_6sp/bs_null/result'
output_file = 'branch_site_lrt_results.txt'
alt_dir = os.path.expanduser(alt_dir)
null_dir = os.path.expanduser(null_dir)
og_files = [f for f in os.listdir(alt_dir) if '_trimed_branch_alt' in f]
with open(output_file, 'w') as out_f:
out_f.write('OG_num\tp_val\tpositive_selection\n')
for og_file in og_files:
og_num = og_file.split('_')[0]
alt_file = os.path.join(alt_dir, og_file)
null_file = os.path.join(null_dir, og_file)
if os.path.exists(null_file):
alt_np, alt_lnL = parse_lnL(alt_file)
null_np, null_lnL = parse_lnL(null_file)
if alt_np is not None and null_np is not None:
p_val = perform_lrt(alt_lnL, alt_np, null_lnL, null_np)
reject_null = '+' if p_val < 0.05 else '-'
out_f.write(f'{og_num}\t{p_val}\t{reject_null}\n')
if __name__ == "__main__":
main()結果はこう
kosukesano@at138:~/tools/for_paml/250207_6sp$ grep -io + branch_site_lrt_results.txt | wc -l
188
kosukesano@at138:~/tools/for_paml/250207_6sp$ 188遺伝子で正の選択を検出!
マダラの新しいゲノムに機能アノテーションを移植する
以下のスクリプトを実行した。
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 12
echo "pwd: $(pwd)"
echo HOME: $HOME
echo USER: $USER
echo JOB_ID: $JOB_ID
echo starting at
date
#BLASTの標準列名を定義
header="qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore"
#出力ファイルの定義
output_file="/home/kosukesano/reference_sequence/250210_Fnc_anno/out_madara_blastp_test.txt"
#列名を出力ファイルに書き込む
echo "$header" > $output_file
#BLASTpを実行し、結果を追加する
singularity exec --bind /usr/local/seq /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 blastp \
-query /home/kosukesano/reference_sequence/250210_Fnc_anno/241129_madara_iso1.aa \
-db /home/kosukesano/reference_sequence/Sory_Tcas_Dmel_Ecol_ref/merge_4sp \
-evalue 1e-04 \
-outfmt 6 >> $output_file
echo ending at
datetRNA関連遺伝子の抽出
テスト
kosukesano@at138:~/tools/for_paml/250210_tRNA/blast_test$ nano e2.fasta
kosukesano@at138:~/tools/for_paml/250210_tRNA/blast_test$ cp ~/tools/for_braker/241129_madara/241129_madara_iso1.aa ../blast_test/
kosukesano@at138:~/tools/for_paml/250210_tRNA/blast_test$ ls
241129_madara_iso1.aa e2.fasta
kosukesano@at138:~/tools/for_paml/250210_tRNA/blast_test$ singularity exec -e /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 makeblastdb -in 241129_madara_iso1.aa -out madara -dbtype prot -
hash_index
WARNING: Skipping mount /opt/pkg/singularity-ce/4.0.0/var/singularity/mnt/session/etc/resolv.conf [files]: /etc/resolv.conf doesn't exist in container
Building a new DB, current time: 02/10/2025 15:09:24
New DB name: /home/kosukesano/tools/for_paml/250210_tRNA/blast_test/madara
New DB title: 241129_madara_iso1.aa
Sequence type: Protein
Keep MBits: T
Maximum file size: 1000000000B
Adding sequences from FASTA; added 12337 sequences in 0.402422 seconds.
kosukesano@at138:~/tools/for_paml/250210_tRNA/blast_test$ ls
241129_madara_iso1.aa e2.fasta madara.phd madara.phi madara.phr madara.pin madara.pog madara.psd madara.psi madara.psq
kosukesano@at138:~/tools/for_paml/250210_tRNA/blast_test$0211
tRNA修飾とCK合成に関わる遺伝子の検出
先行研究に記載のあった遺伝子e1~e17について、NCBIのタンパク質IDを元にアミノ酸配列を取得。これをクエリーにしてマダラのゲノムにblastp検索をかけた。
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 6
echo "pwd: $(pwd)"
echo HOME: $HOME
echo USER: $USER
echo JOB_ID: $JOB_ID
echo starting at
date
#BLASTの標準列名を定義
header="qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore"
#出力ファイルの定義
output_file="/home/kosukesano/tools/for_paml/250210_tRNA/blast_test/tRNAgene_out.txt"
#列名を出力ファイルに書き込む
echo "$header" > $output_file
#BLASTpを実行し、結果を追加する
singularity exec --bind /usr/local/seq /usr/local/biotools/b/blast:2.9.0--pl526h979a64d_3 blastp \
-query /home/kosukesano/tools/for_paml/250210_tRNA/blast_test/tRNAgene.fasta \
-db /home/kosukesano/tools/for_paml/250210_tRNA/blast_test/madara \
-evalue 1e-04 \
-outfmt 6 >> $output_file
echo ending at
date結果は以下の通り。
qseqid sseqid pident length mismatch gapopen qstart qend sstart send evalue bitscore
e1_1 g6098.t1 26.829 123 77 4 1 123 5 114 8.69e-06 45.4
e1_2 g6098.t1 36.792 106 66 1 67 172 7 111 9.27e-22 94.7
e2 g6098.t1 44.875 439 207 5 1 434 1 409 8.39e-115 343
e3 g1799.t1 34.559 408 236 4 2 380 106 511 1.61e-74 243
e3 g6011.t1 27.494 451 277 13 4 433 67 488 3.28e-40 150
e5 g8991.t1 60.000 580 227 3 216 791 67 645 0.0 756
e5 g2065.t1 26.892 502 301 17 213 655 35 529 5.10e-32 130
e8 g6207.t1 53.846 286 124 2 88 367 43 326 2.62e-112 330
e8 g6208.t1 50.877 285 138 1 83 367 20 302 1.85e-103 306
e8 g405.t1 23.904 251 165 11 125 364 15 250 1.28e-07 51.2
e9 g1876.t1 55.977 343 150 1 2 343 3 345 2.56e-147 418
e9 g3375.t1 33.537 328 209 5 14 336 11 334 2.15e-51 172
e10 g11508.t1 52.247 178 85 0 5 182 10 187 3.82e-56 174
e12 g4172.t1 26.136 176 93 7 264 430 206 353 1.62e-08 55.5
e12 g6393.t1 28.099 121 63 3 264 381 291 390 2.05e-08 55.1
e12 g396.t1 30.000 110 68 3 328 437 340 440 4.85e-08 53.9
.
.
.
.
.先行研究では昆虫のゲノムで検出されなかったとしているe1やe12~のタンパク質もヒットしているが、bitscoreやevalueの値が悪そう。
とりあえず機能やオーソログ情報を紐付けてみよう。
# A tibble: 68 × 11
qseqid sseqid evalue Sory_GeneFunction Orthogroup Agra_iso1 Cass_iso1
<chr> <chr> <dbl> <chr> <chr> <chr> <chr>
1 e1_1 g6098.t1 8.69e- 6 tRNA dimethylallylt… OG0003486 Agra_P_0… Cass_AG9…
2 e1_2 g6098.t1 9.27e- 22 tRNA dimethylallylt… OG0003486 Agra_P_0… Cass_AG9…
3 e2 g6098.t1 8.39e-115 tRNA dimethylallylt… OG0003486 Agra_P_0… Cass_AG9…
4 e3 g1799.t1 1.61e- 74 CDK5RAP1-like prote… OG0008328 Agra_P_0… Cass_AG9…
5 e3 g6011.t1 3.28e- 40 threonylcarbamoylad… OG0005024 Agra_P_0… Cass_AG9…
6 e5 g8991.t1 0 protein 5NUC OG0004173 Agra_P_0… Cass_AH1…
7 e5 g2065.t1 5.10e- 32 protein 5NUC-like, … OG0001251 Agra_P_0… Cass_AG9…
8 e8 g6207.t1 2.62e-112 purine nucleoside p… OG0000755 Agra_P_0… Cass_AG9…
9 e8 g6208.t1 1.85e-103 purine nucleoside p… OG0000755 Agra_P_0… Cass_AG9…
10 e8 g405.t1 1.28e- 7 S-methyl-5'-thioade… OG0000587 Agra_P_0… Cass_AG9…
# ℹ 58 more rows
# ℹ 4 more variables: Dpon_iso1 <chr>, Smad_iso1 <chr>, Sory_iso1 <chr>,
# Tcas_iso1 <chr>
0212
PRANKによるコドンアライメント
PAMLの結果がうまく出なかった。どうもコドンベースでアライメントされておらず、3の倍数になってないものがあるみたい。
PRANKを使用しコドンアライメントを行う。
### ~/tools/for_paml/data/250211_6sp/prank.sh
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 6
# ディレクトリパス
input_dir="/home/kosukesano/tools/for_paml/data/250211_6sp/PRANK_CDS/"
output_dir="/home/kosukesano/tools/for_paml/data/250211_6sp/PRANK_CDS/"
# 各ファイルに対してアラインメントを実行
for file in "$input_dir"*.fna; do
# 元のファイル名から拡張子を除いたものを取得
base_name=$(basename "$file" .fna)
# 出力ファイル名を生成
output_file="${output_dir}${base_name}_pranked.fna"
# prankを実行
singularity exec -e /usr/local/biotools/p/prank\:v.170427--h9f5acd7_6 prank -d="$file" -o="$output_file" -codon -F
echo "Aligned file created: $output_file"
doneCK生合成に関わる遺伝子のPAML
~/tools/for_paml/250210_tRNAを作成、この下のnama_data/にOrthoFinder出力のオーソログ.fastaファイルと遺伝子系統樹のファイルを置いた。
kosukesano@at139:~/tools/for_paml/250210_tRNA$ ls nama_data/
OG0000120.fa OG0000572_tree.txt OG0000755.fa OG0001494_tree.txt OG0003982.fa OG0005024_tree.txt OG0008328.fa OG0010036_tree.txt
OG0000120_tree.txt OG0000584.fa OG0000755_tree.txt OG0003285.fa OG0003982_tree.txt OG0006816.fa OG0008328_tree.txt
OG0000203.fa OG0000584_tree.txt OG0001251.fa OG0003285_tree.txt OG0004173.fa OG0006816_tree.txt OG0009811.fa
OG0000203_tree.txt OG0000587.fa OG0001251_tree.txt OG0003486.fa OG0004173_tree.txt OG0008087.fa OG0009811_tree.txt
OG0000572.fa OG0000587_tree.txt OG0001494.fa OG0003486_tree.txt OG0005024.fa OG0008087_tree.txt OG0010036.fa
kosukesano@at139:~/tools/for_paml/250210_tRNA$~/tools/for_paml/250210_tRNAでExOG.pyを実行した。
### ~/tools/for_paml/250210_tRNA/ExOG.py
# ファイルパスの設定
orthogroups_file_path = '/home/kosukesano/tools/for_orthofinder/250128_6sp_iso1/Change_hedder/OrthoFinder/Results_Jan28/Orthogroups/Orthogroups.txt'
single_copy_orthologues_file_path = '/home/kosukesano/tools/for_paml/250210_tRNA/CK_OG_no.txt'
output_file_path = '/home/kosukesano/tools/for_paml/250210_tRNA/extracted_orthogroups.txt'
# シングルコピーオルソログのIDをセットに格納
single_copy_orthologues = set()
with open(single_copy_orthologues_file_path, 'r') as single_copy_file:
for line in single_copy_file:
single_copy_orthologues.add(line.strip())
# Orthogroups.txt から該当する行を抽出して新しいファイルに保存
with open(orthogroups_file_path, 'r') as orthogroups_file, open(output_file_path, 'w') as output_file:
for line in orthogroups_file:
# 行の最初の部分を取り出してIDをチェック
og_id = line.split(':')[0].strip()
if og_id in single_copy_orthologues:
output_file.write(line)続いて、makefna.pyを実行した。
### ~/tools/for_paml/250210_tRNA/makefna.pyの中身
import os
# ファイルパスの設定
orthogroups_file = "/home/kosukesano/tools/for_paml/250210_tRNA/extracted_orthogroups.txt"
input_dir = "/home/kosukesano/tools/for_paml/data/250205_6sp/changehedder/kansei/"
output_dir = "/home/kosukesano/tools/for_paml/250210_tRNA/SCO_CDS/"
# ディレクトリが存在しない場合、作成
os.makedirs(output_dir, exist_ok=True)
# 種名とプレフィックスの対応辞書
species_prefix = {
"Agra": "Agra",
"Cass": "Cass",
"Dpon": "Dpon",
"Smad": "Smad",
"Sory": "Sory",
"Tcas": "Tcas"
}
# 各種のFASTAファイルを辞書に格納
fasta_files = {species: os.path.join(input_dir, f"{species}_changehedder.fasta") for species in species_prefix}
# OG番号と遺伝子IDをextracted_orthogroups.txtから取得
with open(orthogroups_file, "r") as ortho_f:
for line in ortho_f:
if line.strip(): # 空行を無視
# 行をOG番号と遺伝子IDリストに分割
og_number, gene_ids_str = line.split(":")
og_number = og_number.strip()
gene_ids = gene_ids_str.strip().split()
# 出力ファイルのパス
output_file = os.path.join(output_dir, f"{og_number}.fna")
# 出力ファイルを開く
with open(output_file, "w") as out_f:
for gene_id in gene_ids:
species = None
prefix = gene_id[:4]
# 種類の判定
for key, sp in species_prefix.items():
if prefix.startswith(sp):
species = sp
break
if species and species in fasta_files:
fasta_file = fasta_files[species]
with open(fasta_file, "r") as fasta_f:
write_flag = False
for fasta_line in fasta_f:
if fasta_line.startswith(f">{gene_id}"):
out_f.write(fasta_line)
print(fasta_line.strip())
write_flag = True
elif fasta_line.startswith(">") and write_flag:
write_flag = False
elif write_flag:
out_f.write(fasta_line)
print(fasta_line.strip())
print(f"{og_number}.fna ファイルが {output_dir} に保存されました。")MCOにも対応した最新使用。
これで出力されたファイルに対してPRANKでアライメントを行う
#$ -S /bin/bash
#$ -cwd
#$ -l gpu
#$ -pe def_slot 6
# ディレクトリパス
input_dir="/home/kosukesano/tools/for_paml/250210_tRNA/SCO_CDS/"
output_dir="/home/kosukesano/tools/for_paml/250210_tRNA/PRANK_CDS/"
# 各ファイルに対してアラインメントを実行
for file in "$input_dir"*.fna; do
# 元のファイル名から拡張子を除いたものを取得
base_name=$(basename "$file" .fna)
# 出力ファイル名を生成
output_file="${output_dir}${base_name}_pranked.fna"
# prankを実行
singularity exec -e /usr/local/biotools/p/prank\:v.170427--h9f5acd7_6 prank -d="$file" -o="$output_file" -codon -F
echo "Aligned file created: $output_file"
doneまた、OrthoFinder出力の遺伝子系統樹には種名の「Tcas_iso1_」のような接頭辞が葉のラベルにくっついていた。これを切り取る。
### ~/tools/for_paml/250210_tRNA/tree_edit.py
import os
import glob
# 処理対象のディレクトリ
directory = "/home/kosukesano/tools/for_paml/250210_tRNA/nama_data"
# 削除する文字列のリスト
remove_strings = ["Tcas_iso1_", "Smad_iso1_", "Dpon_iso1_",
"Cass_iso1_", "Agra_iso1_", "Sory_iso1_"]
# _tree.txt で終わるファイルを取得
tree_files = glob.glob(os.path.join(directory, "*_tree.txt"))
# 各ファイルを処理
for file_path in tree_files:
with open(file_path, "r") as f:
content = f.read()
# 指定の文字列をすべて除去
for remove_str in remove_strings:
content = content.replace(remove_str, "")
# ファイルを上書き保存
with open(file_path, "w") as f:
f.write(content)
print(f"Processed: {file_path}")
print("すべてのファイルを処理しました。")3の倍数になるようにNをつける
PRANKは入力に使う配列が3の倍数じゃないとエラー起こしちゃうっぽい。
これを防ぐには、配列の最後にNを加えて3の倍数にしてあげればいい。
### ~/tools/for_paml/250210_tRNA/plusN.py
from pathlib import Path
def adjust_fasta_length(input_file, output_file):
"""FASTAファイルの配列を3の倍数に調整し、Nを追加する"""
with open(input_file, "r") as infile, open(output_file, "w") as outfile:
header = ""
sequence = ""
for line in infile:
line = line.strip()
if line.startswith(">"): # ヘッダー行
if header and sequence: # 既存のデータがある場合は処理
remainder = len(sequence) % 3
if remainder != 0:
sequence += "N" * (3 - remainder) # N を追加
outfile.write(header + "\n" + sequence + "\n")
header = line # 新しいヘッダーを保存
sequence = "" # 新しいシーケンスをリセット
else:
sequence += line # 配列データを連結
# 最後のシーケンスを処理
if header and sequence:
remainder = len(sequence) % 3
if remainder != 0:
sequence += "N" * (3 - remainder)
outfile.write(header + "\n" + sequence + "\n")
def process_all_fasta_files(input_dir, output_dir):
"""input_dir 内のすべての .fna ファイルを処理し、output_dir に保存"""
input_path = Path(input_dir)
output_path = Path(output_dir)
# 出力ディレクトリが存在しない場合は作成
output_path.mkdir(parents=True, exist_ok=True)
# .fna ファイルを処理
for fasta_file in input_path.glob("*.fna"):
output_file = output_path / fasta_file.name # 出力ファイルのパス
adjust_fasta_length(fasta_file, output_file)
print(f"処理完了: {output_file}")
# 実行設定
input_directory = "/home/kosukesano/tools/for_paml/250210_tRNA/SCO_CDS"
output_directory = "/home/kosukesano/tools/for_paml/250210_tRNA/SCO_CDS_plusN"
process_all_fasta_files(input_directory, output_directory)
print("全ての処理が完了しました。")0213
3の倍数になるようにNをつけた後のPRANK結果
kosukesano@at138:~/tools/for_paml/250210_tRNA$ ls PRANK_CDS_plusN/
OG0000120_pranked.fna.best.fas OG0000755_pranked.fna.best.fas OG0003982_pranked.fna.best.fas OG0008328_pranked.fna.best.fas
OG0000203_pranked.fna.best.fas OG0001251_pranked.fna.best.fas OG0004173_pranked.fna.best.fas OG0009811_pranked.fna.best.fas
OG0000572_pranked.fna.best.fas OG0001494_pranked.fna.best.fas OG0005024_pranked.fna.best.fas OG0010036_pranked.fna.best.fas
OG0000584_pranked.fna.best.fas OG0003285_pranked.fna.best.fas OG0006816_pranked.fna.best.fas
OG0000587_pranked.fna.best.fas OG0003486_pranked.fna.best.fas OG0008087_pranked.fna.best.fas
kosukesano@at138:~/tools/for_paml/250210_tRNA$ できてる!
CK合成に関わるオーソログの系統樹に#1をつける
OG0000120
# A tibble: 8 × 5
qseqid sseqid Orthogroup evalue Sory_GeneFunction
<chr> <chr> <chr> <dbl> <chr>
1 e12 g6393.t1 OG0000120 2.05e- 8 UDP-glucuronosyltransferase 2C1-like isof…
2 e12 g6392.t1 OG0000120 1.99e- 7 UDP-glucuronosyltransferase 2C1-like isof…
3 e14 g6393.t1 OG0000120 1.32e-11 UDP-glucuronosyltransferase 2C1-like isof…
4 e14 g6392.t1 OG0000120 2.97e-11 UDP-glucuronosyltransferase 2C1-like isof…
5 e15 g6393.t1 OG0000120 8.32e- 7 UDP-glucuronosyltransferase 2C1-like isof…
6 e15 g6392.t1 OG0000120 2.13e- 6 UDP-glucuronosyltransferase 2C1-like isof…
7 e16 g6392.t1 OG0000120 9.32e- 8 UDP-glucuronosyltransferase 2C1-like isof…
8 e16 g6393.t1 OG0000120 2.19e- 7 UDP-glucuronosyltransferase 2C1-like isof…
(g6392, g6393)に#1を振った。
OG0000203
# A tibble: 7 × 5
qseqid sseqid Orthogroup evalue Sory_GeneFunction
<chr> <chr> <chr> <dbl> <chr>
1 e12 g9319.t1 OG0000203 0.00000438 UDP-glucuronosyltransferase 1-9-like
2 e12 g9322.t1 OG0000203 0.00000451 UDP-glucuronosyltransferase 1-9-like
3 e12 g9321.t1 OG0000203 0.0000226 UDP-glucuronosyltransferase 1-9-like
4 e14 g9321.t1 OG0000203 0.00000126 UDP-glucuronosyltransferase 1-9-like
5 e14 g9319.t1 OG0000203 0.00000275 UDP-glucuronosyltransferase 1-9-like
6 e14 g9322.t1 OG0000203 0.0000028 UDP-glucuronosyltransferase 1-9-like
7 e16 g9319.t1 OG0000203 0.0000215 UDP-glucuronosyltransferase 1-9-like
(g9321, g9322)、g9319に#1を振った。別でg9323(アノテーションのつかなかったほう)だけの解析をしてもいいかも。
OG0000572
# A tibble: 8 × 5
qseqid sseqid Orthogroup evalue Sory_GeneFunction
<chr> <chr> <chr> <dbl> <chr>
1 e12 g6542.t1 OG0000572 1.13e- 7 UDP-glucuronosyltransferase 2B13-like
2 e12 g6541.t1 OG0000572 5.47e- 6 UDP-glucuronosyltransferase 2B13-like
3 e13 g6542.t1 OG0000572 9.55e- 6 UDP-glucuronosyltransferase 2B13-like
4 e14 g6542.t1 OG0000572 4.42e-11 UDP-glucuronosyltransferase 2B13-like
5 e14 g6541.t1 OG0000572 7.08e-10 UDP-glucuronosyltransferase 2B13-like
6 e15 g6542.t1 OG0000572 6.22e- 8 UDP-glucuronosyltransferase 2B13-like
7 e16 g6542.t1 OG0000572 4.22e-10 UDP-glucuronosyltransferase 2B13-like
8 e16 g6541.t1 OG0000572 7.21e- 8 UDP-glucuronosyltransferase 2B13-like
g6541に#1を振った。
OG0000584
# A tibble: 10 × 5
qseqid sseqid Orthogroup evalue Sory_GeneFunction
<chr> <chr> <chr> <dbl> <chr>
1 e12 g396.t1 OG0000584 0.0000000485 2-hydroxyacylsphingosine 1-beta-gala…
2 e12 g8427.t1 OG0000584 0.000000177 2-hydroxyacylsphingosine 1-beta-gala…
3 e13 g8427.t1 OG0000584 0.00000362 2-hydroxyacylsphingosine 1-beta-gala…
4 e13 g396.t1 OG0000584 0.00000391 2-hydroxyacylsphingosine 1-beta-gala…
5 e14 g396.t1 OG0000584 0.0000000649 2-hydroxyacylsphingosine 1-beta-gala…
6 e14 g8427.t1 OG0000584 0.000000872 2-hydroxyacylsphingosine 1-beta-gala…
7 e15 g396.t1 OG0000584 0.000000156 2-hydroxyacylsphingosine 1-beta-gala…
8 e15 g8427.t1 OG0000584 0.000000271 2-hydroxyacylsphingosine 1-beta-gala…
9 e16 g396.t1 OG0000584 0.000000283 2-hydroxyacylsphingosine 1-beta-gala…
10 e16 g8427.t1 OG0000584 0.000000403 2-hydroxyacylsphingosine 1-beta-gala…
g8427, g396に#1を振った
OG0000587
# A tibble: 1 × 5
qseqid sseqid Orthogroup evalue Sory_GeneFunction
<chr> <chr> <chr> <dbl> <chr>
1 e8 g405.t1 OG0000587 0.000000128 S-methyl-5'-thioadenosine phosphorylase…
g405に#1を振った。別でg6550, g6551(アノテーションのつかなかったほう)だけの解析をしてもいいかも。
OG0000755
# A tibble: 2 × 5
qseqid sseqid Orthogroup evalue Sory_GeneFunction
<chr> <chr> <chr> <dbl> <chr>
1 e8 g6207.t1 OG0000755 2.62e-112 purine nucleoside phosphorylase-like iso…
2 e8 g6208.t1 OG0000755 1.85e-103 purine nucleoside phosphorylase-like iso…
g6027, g6028に#1を振った
OG0001251
# A tibble: 1 × 5
qseqid sseqid Orthogroup evalue Sory_GeneFunction
<chr> <chr> <chr> <dbl> <chr>
1 e5 g2065.t1 OG0001251 5.10e-32 protein 5NUC-like, partial
g2065に#1を振った
*OG0001494
# A tibble: 1 × 5
qseqid sseqid Orthogroup evalue Sory_GeneFunction
<chr> <chr> <chr> <dbl> <chr>
1 e9 g1876.t1 OG0001494 2.56e-147 adenosine kinase
g1876に#1を振った
OG0003285
# A tibble: 16 × 5
qseqid sseqid Orthogroup evalue Sory_GeneFunction
<chr> <chr> <chr> <dbl> <chr>
1 e12 g4172.t1 OG0003285 1.62e- 8 UDP-glucuronosyltransferase 2B15-like
2 e12 g4169.t1 OG0003285 8.65e- 6 UDP-glucuronosyltransferase 2B15-like
3 e12 g6414.t2 OG0003285 1.06e- 5 UDP-glucuronosyltransferase 2B15-like
4 e13 g4172.t1 OG0003285 4.40e- 6 UDP-glucuronosyltransferase 2B15-like
5 e14 g4172.t1 OG0003285 2.02e-10 UDP-glucuronosyltransferase 2B15-like
6 e14 g4170.t1 OG0003285 3.56e- 8 UDP-glucuronosyltransferase 2B15-like
7 e14 g4171.t1 OG0003285 4.50e- 8 UDP-glucuronosyltransferase 2B15-like
8 e14 g6414.t2 OG0003285 8.37e- 8 UDP-glucuronosyltransferase 2B15-like
9 e14 g4169.t1 OG0003285 1.88e- 7 UDP-glucuronosyltransferase 2B15-like
10 e15 g4172.t1 OG0003285 5.09e- 8 UDP-glucuronosyltransferase 2B15-like
11 e15 g4169.t1 OG0003285 5.08e- 6 UDP-glucuronosyltransferase 2B15-like
12 e16 g4172.t1 OG0003285 2.64e- 8 UDP-glucuronosyltransferase 2B15-like
13 e16 g4170.t1 OG0003285 5.03e- 8 UDP-glucuronosyltransferase 2B15-like
14 e16 g4171.t1 OG0003285 1.32e- 7 UDP-glucuronosyltransferase 2B15-like
15 e16 g4169.t1 OG0003285 8.37e- 7 UDP-glucuronosyltransferase 2B15-like
16 e16 g6414.t2 OG0003285 6.89e- 6 UDP-glucuronosyltransferase 2B15-like
マダラ枝基部に#1を振った
OG0003486
# A tibble: 3 × 5
qseqid sseqid Orthogroup evalue Sory_GeneFunction
<chr> <chr> <chr> <dbl> <chr>
1 e1_1 g6098.t1 OG0003486 8.69e- 6 tRNA dimethylallyltransferase isoform X1
2 e1_2 g6098.t1 OG0003486 9.27e- 22 tRNA dimethylallyltransferase isoform X1
3 e2 g6098.t1 OG0003486 8.39e-115 tRNA dimethylallyltransferase isoform X1
g6098に#1を振った
OG003982
# A tibble: 2 × 5
qseqid sseqid Orthogroup evalue Sory_GeneFunction
<chr> <chr> <chr> <dbl> <chr>
1 e12 g1104.t1 OG0003982 0.00000616 2-hydroxyacylsphingosine 1-beta-gala…
2 e14 g1104.t1 OG0003982 0.00000000168 2-hydroxyacylsphingosine 1-beta-gala…
g1104に#1を振った
OG0004173
# A tibble: 1 × 5
qseqid sseqid Orthogroup evalue Sory_GeneFunction
<chr> <chr> <chr> <dbl> <chr>
1 e5 g8991.t1 OG0004173 0 protein 5NUC
g8991に#1を振った
OG0005024
# A tibble: 1 × 5
qseqid sseqid Orthogroup evalue Sory_GeneFunction
<chr> <chr> <chr> <dbl> <chr>
1 e3 g6011.t1 OG0005024 3.28e-40 threonylcarbamoyladenosine tRNA methylthi…
g6011に#1を振った
OG0006816
# A tibble: 1 × 5
qseqid sseqid Orthogroup evalue Sory_GeneFunction
<chr> <chr> <chr> <dbl> <chr>
1 e17 g5362.t1 OG0006816 0.0000000268 delta(24)-sterol reductase-like
g5362に#1を振った
OG0008087
# A tibble: 3 × 5
qseqid sseqid Orthogroup evalue Sory_GeneFunction
<chr> <chr> <chr> <dbl> <chr>
1 e13 g11571.t1 OG0008087 0.0000485 UDP-glucuronosyltransferase 2C1-like
2 e14 g11571.t1 OG0008087 0.0000119 UDP-glucuronosyltransferase 2C1-like
3 e16 g11571.t1 OG0008087 0.0000000322 UDP-glucuronosyltransferase 2C1-like
g11571に#1を振った
OG0008328
# A tibble: 1 × 5
qseqid sseqid Orthogroup evalue Sory_GeneFunction
<chr> <chr> <chr> <dbl> <chr>
1 e3 g1799.t1 OG0008328 1.61e-74 CDK5RAP1-like protein
g1799に#1を振った
OG0009811
# A tibble: 1 × 5
qseqid sseqid Orthogroup evalue Sory_GeneFunction
<chr> <chr> <chr> <dbl> <chr>
1 e9 g3375.t1 OG0009811 2.15e-51 adenosine kinase-like
g3375に#1を振った
OG0010036
# A tibble: 1 × 5
qseqid sseqid Orthogroup evalue Sory_GeneFunction
<chr> <chr> <chr> <dbl> <chr>
1 e10 g11508.t1 OG0010036 3.82e-56 adenine phosphoribosyltransferase
g11508に#1を振った